From 350afa8a1101f62ce31bc4ed6f69cf4b90ec4fa2 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Thu, 8 Aug 2024 06:29:34 +0000 Subject: x86/split_lock: Move Split and Bus lock code to a dedicated file Bus Lock Detect functionality on AMD platforms works identical to Intel. Move split_lock and bus_lock specific code from intel.c to a dedicated file so that it can be compiled and supported on non-Intel platforms. Also, introduce CONFIG_X86_BUS_LOCK_DETECT, make it dependent on CONFIG_CPU_SUP_INTEL and add compilation dependency of the new bus_lock.c file on CONFIG_X86_BUS_LOCK_DETECT. Signed-off-by: Ravi Bangoria Signed-off-by: Thomas Gleixner Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/all/20240808062937.1149-2-ravi.bangoria@amd.com --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index f8d150343d42..d606b2a1de99 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -976,7 +976,7 @@ struct task_struct { #ifdef CONFIG_ARCH_HAS_CPU_PASID unsigned pasid_activated:1; #endif -#ifdef CONFIG_CPU_SUP_INTEL +#ifdef CONFIG_X86_BUS_LOCK_DETECT unsigned reported_split_lock:1; #endif #ifdef CONFIG_TASK_DELAY_ACCT -- cgit v1.2.3 From 57d298bdb46bc240498675a7f887fc71089da2c0 Mon Sep 17 00:00:00 2001 From: Mikko Perttunen Date: Thu, 25 Apr 2024 08:02:36 +0300 Subject: gpu: host1x: Add MLOCK recovery for rest of engines Add class IDs / MLOCKs for MLOCK recovery for rest of engines present on Tegra234. Signed-off-by: Mikko Perttunen Signed-off-by: Thierry Reding Link: https://patchwork.freedesktop.org/patch/msgid/20240425050238.2943404-4-cyndis@kapsi.fi --- include/linux/host1x.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/host1x.h b/include/linux/host1x.h index 9c8119ed13a4..5a7a81e5f9bd 100644 --- a/include/linux/host1x.h +++ b/include/linux/host1x.h @@ -14,12 +14,17 @@ enum host1x_class { HOST1X_CLASS_HOST1X = 0x1, + HOST1X_CLASS_NVJPG1 = 0x7, + HOST1X_CLASS_NVENC = 0x21, + HOST1X_CLASS_NVENC1 = 0x22, HOST1X_CLASS_GR2D = 0x51, HOST1X_CLASS_GR2D_SB = 0x52, HOST1X_CLASS_VIC = 0x5D, HOST1X_CLASS_GR3D = 0x60, + HOST1X_CLASS_NVJPG = 0xC0, HOST1X_CLASS_NVDEC = 0xF0, HOST1X_CLASS_NVDEC1 = 0xF5, + HOST1X_CLASS_OFA = 0xF8, }; struct host1x; -- cgit v1.2.3 From 3d70eb8e60c6d009c988c8e696cb68c77ff59628 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Fri, 23 Aug 2024 16:07:24 +0800 Subject: gpu: host1x: Make host1x_context_device_bus_type constant Since commit d492cc2573a0 ("driver core: device.h: make struct bus_type a const *"), the driver core can properly handle constant struct bus_type, move the host1x_context_device_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Kunwu Chan Signed-off-by: Thierry Reding Link: https://patchwork.freedesktop.org/patch/msgid/20240823080724.148423-1-kunwu.chan@linux.dev --- include/linux/host1x_context_bus.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/host1x_context_bus.h b/include/linux/host1x_context_bus.h index 72462737a6db..c928cb432680 100644 --- a/include/linux/host1x_context_bus.h +++ b/include/linux/host1x_context_bus.h @@ -9,7 +9,7 @@ #include #ifdef CONFIG_TEGRA_HOST1X_CONTEXT_BUS -extern struct bus_type host1x_context_device_bus_type; +extern const struct bus_type host1x_context_device_bus_type; #endif #endif -- cgit v1.2.3 From a401bd1264b400f96a4cf61ed3fc144008e97a4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Mon, 26 Aug 2024 14:25:39 +0200 Subject: dma-buf: give examples of error codes to use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dma_fence_set_error() function allows to set an error code on a dma_fence object before it is signaled. Document some of the potential error codes drivers should use and especially what they mean. Signed-off-by: Christian König Acked-by: Daniel Vetter Reviewed-by: Alex Deucher Link: https://patchwork.freedesktop.org/patch/msgid/20240826122541.85663-2-christian.koenig@amd.com --- include/linux/dma-fence.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index e06bad467f55..e7ad819962e3 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -574,6 +574,12 @@ int dma_fence_get_status(struct dma_fence *fence); * rather than success. This must be set before signaling (so that the value * is visible before any waiters on the signal callback are woken). This * helper exists to help catching erroneous setting of #dma_fence.error. + * + * Examples of error codes which drivers should use: + * + * * %-ENODATA This operation produced no data, no other operation affected. + * * %-ECANCELED All operations from the same context have been canceled. + * * %-ETIME Operation caused a timeout and potentially device reset. */ static inline void dma_fence_set_error(struct dma_fence *fence, int error) -- cgit v1.2.3 From 8cf9a01edc216b16b5839eb793ac544d2c97ce97 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 11 Sep 2024 15:42:57 -0400 Subject: fs: Introduce FOP_ASYNC_LOCK Some lock managers (NLM, kNFSD) fastidiously avoid blocking their kernel threads while servicing blocking locks. If a filesystem supports asynchronous lock requests those lock managers can use notifications to quickly inform clients they have acquired a file lock. Historically, only posix_lock_file() was capable of supporting asynchronous locks so the check for support was simply file_operations->lock(), but with recent changes in DLM, both GFS2 and OCFS2 also support asynchronous locks and have started signalling their support with EXPORT_OP_ASYNC_LOCK. We recently noticed that those changes dropped the checks for whether a filesystem simply defaults to posix_lock_file(), so async lock notifications have not been attempted for NLM and NFSv4.1+ for most filesystems. While trying to fix this it has become clear that testing both the export flag combined with testing ->lock() creates quite a layering mess. It seems appropriate to signal support with a fop_flag. Add FOP_ASYNC_LOCK so that filesystems with ->lock() can signal their capability to handle lock requests asynchronously. Add a helper for lock managers to properly test that support. Signed-off-by: Benjamin Coddington Link: https://lore.kernel.org/r/3330d5a324abe2ce9c1dafe89cacdc6db41945d1.1726083391.git.bcodding@redhat.com Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/filelock.h | 5 +++++ include/linux/fs.h | 2 ++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/filelock.h b/include/linux/filelock.h index daee999d05f3..58c1120a8253 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -180,6 +180,11 @@ static inline void locks_wake_up(struct file_lock *fl) wake_up(&fl->c.flc_wait); } +static inline bool locks_can_async_lock(const struct file_operations *fops) +{ + return !fops->lock || fops->fop_flags & FOP_ASYNC_LOCK; +} + /* fs/locks.c */ void locks_free_lock_context(struct inode *inode); void locks_free_lock(struct file_lock *fl); diff --git a/include/linux/fs.h b/include/linux/fs.h index 6ca11e241a24..78221ae589d9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2074,6 +2074,8 @@ struct file_operations { #define FOP_DIO_PARALLEL_WRITE ((__force fop_flags_t)(1 << 3)) /* Contains huge pages */ #define FOP_HUGE_PAGES ((__force fop_flags_t)(1 << 4)) +/* Supports asynchronous lock callbacks */ +#define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 5)) /* Wrap a directory iterator that needs exclusive inode access */ int wrap_directory_iterator(struct file *, struct dir_context *, -- cgit v1.2.3 From 2b0996c7646293c71c1297c00e07e54cee9bec5c Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Fri, 6 Sep 2024 12:53:59 -0700 Subject: wifi: ath9k: remove ath9k_platform_data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Completely unused here in favor of Device Tree based setup. The DT code in here should currently match what is available with platform files. Any such lapse can always be added. Signed-off-by: Rosen Penev Acked-by: Toke Høiland-Jørgensen Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20240906195359.6982-4-rosenp@gmail.com --- include/linux/ath9k_platform.h | 51 ------------------------------------------ 1 file changed, 51 deletions(-) delete mode 100644 include/linux/ath9k_platform.h (limited to 'include/linux') diff --git a/include/linux/ath9k_platform.h b/include/linux/ath9k_platform.h deleted file mode 100644 index 76860a461ed2..000000000000 --- a/include/linux/ath9k_platform.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2008 Atheros Communications Inc. - * Copyright (c) 2009 Gabor Juhos - * Copyright (c) 2009 Imre Kaloz - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#ifndef _LINUX_ATH9K_PLATFORM_H -#define _LINUX_ATH9K_PLATFORM_H - -#define ATH9K_PLAT_EEP_MAX_WORDS 2048 - -struct ath9k_platform_data { - const char *eeprom_name; - - u16 eeprom_data[ATH9K_PLAT_EEP_MAX_WORDS]; - u8 *macaddr; - - int led_pin; - u32 gpio_mask; - u32 gpio_val; - - u32 bt_active_pin; - u32 bt_priority_pin; - u32 wlan_active_pin; - - bool endian_check; - bool is_clk_25mhz; - bool tx_gain_buffalo; - bool disable_2ghz; - bool disable_5ghz; - bool led_active_high; - - int (*get_mac_revision)(void); - int (*external_reset)(void); - - bool use_eeprom; -}; - -#endif /* _LINUX_ATH9K_PLATFORM_H */ -- cgit v1.2.3 From 1e436f4fff1fd1fcc904ee18139f7e284001dc81 Mon Sep 17 00:00:00 2001 From: Shuicheng Lin Date: Tue, 17 Sep 2024 14:47:32 +0000 Subject: drm/scheduler: Improve documentation Function drm_sched_entity_push_job() doesn't have a return value, remove the return value description for it. Correct several other typo errors. v2 (Philipp): - more correction with related comments. Signed-off-by: Shuicheng Lin Reviewed-by: Philipp Stanner Signed-off-by: Simona Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20240917144732.2758572-1-shuicheng.lin@intel.com --- include/linux/dma-resv.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-resv.h b/include/linux/dma-resv.h index 8d0e34dad446..c5ab6fd9ebe8 100644 --- a/include/linux/dma-resv.h +++ b/include/linux/dma-resv.h @@ -105,10 +105,10 @@ enum dma_resv_usage { * This should be used by submissions which don't want to participate in * any implicit synchronization. * - * The most common case are preemption fences, page table updates, TLB - * flushes as well as explicit synced user submissions. + * The most common cases are preemption fences, page table updates, TLB + * flushes as well as explicitly synced user submissions. * - * Explicit synced user user submissions can be promoted to + * Explicitly synced user submissions can be promoted to * DMA_RESV_USAGE_READ or DMA_RESV_USAGE_WRITE as needed using * dma_buf_import_sync_file() when implicit synchronization should * become necessary after initial adding of the fence. -- cgit v1.2.3 From b4ad4ef374d66cc8df3188bb1ddb65bce5fc9e50 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 16 Sep 2024 15:33:20 +0200 Subject: gpu: host1x: Set up device DMA parameters In order to store device DMA parameters, the DMA framework depends on the device's dma_parms field to point at a valid memory location. Add backing storage for this in struct host1x_memory_context and point to it. Reported-by: Jonathan Hunter Reviewed-by: Christoph Hellwig Tested-by: Jon Hunter Signed-off-by: Thierry Reding Link: https://patchwork.freedesktop.org/patch/msgid/20240916133320.368620-1-thierry.reding@gmail.com --- include/linux/host1x.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/host1x.h b/include/linux/host1x.h index 5a7a81e5f9bd..9fa9c30a34e6 100644 --- a/include/linux/host1x.h +++ b/include/linux/host1x.h @@ -471,6 +471,7 @@ struct host1x_memory_context { refcount_t ref; struct pid *owner; + struct device_dma_parameters dma_parms; struct device dev; u64 dma_mask; u32 stream_id; -- cgit v1.2.3 From 1cc2e1faafb3b5a2be25112559bdb495736b5af7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 20 Sep 2024 10:57:57 +0200 Subject: pwm: Add more locking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This ensures that a pwm_chip that has no corresponding driver isn't used and that a driver doesn't go away while a callback is still running. In the presence of device links this isn't necessary yet (so this is no fix) but for pwm character device support this is needed. To not serialize all pwm_apply_state() calls, this introduces a per chip lock. An additional complication is that for atomic chips a mutex cannot be used (as pwm_apply_atomic() must not sleep) and a spinlock cannot be held while calling an operation for a sleeping chip. So depending on the chip being atomic or not a spinlock or a mutex is used. An additional change implemented here is that on driver remove the .free() callback is called for each requested pwm_device. This is the right time because later (e.g. when the consumer calls pwm_put()) the free function is (maybe) not available any more. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/026aa891c8270a11723a1ba7e4256f456f7e1e86.1726819463.git.u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- include/linux/pwm.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 8acd60b53f58..3ea73e075abe 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -275,6 +275,9 @@ struct pwm_ops { * @of_xlate: request a PWM device given a device tree PWM specifier * @atomic: can the driver's ->apply() be called in atomic context * @uses_pwmchip_alloc: signals if pwmchip_allow was used to allocate this chip + * @operational: signals if the chip can be used (or is already deregistered) + * @nonatomic_lock: mutex for nonatomic chips + * @atomic_lock: mutex for atomic chips * @pwms: array of PWM devices allocated by the framework */ struct pwm_chip { @@ -290,6 +293,16 @@ struct pwm_chip { /* only used internally by the PWM framework */ bool uses_pwmchip_alloc; + bool operational; + union { + /* + * depending on the chip being atomic or not either the mutex or + * the spinlock is used. It protects .operational and + * synchronizes the callbacks in .ops + */ + struct mutex nonatomic_lock; + spinlock_t atomic_lock; + }; struct pwm_device pwms[] __counted_by(npwm); }; -- cgit v1.2.3 From 17e40c25158f2505cbcdeda96624afcbab4af368 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 20 Sep 2024 10:57:58 +0200 Subject: pwm: New abstraction for PWM waveforms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Up to now the configuration of a PWM setting is described exclusively by a struct pwm_state which contains information about period, duty_cycle, polarity and if the PWM is enabled. (There is another member usage_power which doesn't completely fit into pwm_state, I ignore it here for simplicity.) Instead of a polarity the new abstraction has a member duty_offset_ns that defines when the rising edge happens after the period start. This is more general, as with a pwm_state the rising edge can only happen at the period's start or such that the falling edge is at the end of the period (i.e. duty_offset_ns == 0 or duty_offset_ns == period_length_ns - duty_length_ns). A disabled PWM is modeled by .period_length_ns = 0. In my eyes this is a nice usage of that otherwise unusable setting, as it doesn't define anything about the future which matches the fact that consumers should consider the state of the output as undefined and it's just there to say "No further requirements about the output, you can save some power.". Further I renamed period and duty_cycle to period_length_ns and duty_length_ns. In the past there was confusion from time to time about duty_cycle being measured in nanoseconds because people expected a percentage of period instead. With "length_ns" as suffix the semantic should be more obvious to people unfamiliar with the pwm subsystem. period is renamed to period_length_ns for consistency. The API for consumers doesn't change yet, but lowlevel drivers can implement callbacks that work with pwm_waveforms instead of pwm_states. A new thing about these callbacks is that the calculation of hardware settings needed to implement a certain waveform is separated from actually writing these settings. The motivation for that is that this allows a consumer to query the hardware capabilities without actually modifying the hardware state. The rounding rules that are expected to be implemented in the round_waveform_tohw() are: First pick the biggest possible period not bigger than wf->period_length_ns. For that period pick the biggest possible duty setting not bigger than wf->duty_length_ns. Third pick the biggest possible offset not bigger than wf->duty_offset_ns. If the requested period is too small for the hardware, it's expected that a setting with the minimal period and duty_length_ns = duty_offset_ns = 0 is returned and this fact is signaled by a return value of 1. Signed-off-by: Uwe Kleine-König Tested-by: Trevor Gamblin Link: https://lore.kernel.org/r/df0faa33bf9e7c9e2e5eab8d31bbf61e861bd401.1726819463.git.u.kleine-koenig@baylibre.com [ukleinek: Update pwm_check_rounding() to return bool instead of int.] Signed-off-by: Uwe Kleine-König --- include/linux/pwm.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 3ea73e075abe..d8cfe1c9b19d 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -49,6 +49,31 @@ enum { PWMF_EXPORTED = 1, }; +/** + * struct pwm_waveform - description of a PWM waveform + * @period_length_ns: PWM period + * @duty_length_ns: PWM duty cycle + * @duty_offset_ns: offset of the rising edge from the period's start + * + * This is a representation of a PWM waveform alternative to struct pwm_state + * below. It's more expressive than struct pwm_state as it contains a + * duty_offset_ns and so can represent offsets other than zero (with .polarity = + * PWM_POLARITY_NORMAL) and period - duty_cycle (.polarity = + * PWM_POLARITY_INVERSED). + * + * Note there is no explicit bool for enabled. A "disabled" PWM is represented + * by .period_length_ns = 0. Note further that the behaviour of a "disabled" PWM + * is undefined. Depending on the hardware's capabilities it might drive the + * active or inactive level, go high-z or even continue to toggle. + * + * The unit for all three members is nanoseconds. + */ +struct pwm_waveform { + u64 period_length_ns; + u64 duty_length_ns; + u64 duty_offset_ns; +}; + /* * struct pwm_state - state of a PWM channel * @period: PWM period (in nanoseconds) @@ -259,6 +284,17 @@ struct pwm_ops { void (*free)(struct pwm_chip *chip, struct pwm_device *pwm); int (*capture)(struct pwm_chip *chip, struct pwm_device *pwm, struct pwm_capture *result, unsigned long timeout); + + size_t sizeof_wfhw; + int (*round_waveform_tohw)(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_waveform *wf, void *wfhw); + int (*round_waveform_fromhw)(struct pwm_chip *chip, struct pwm_device *pwm, + const void *wfhw, struct pwm_waveform *wf); + int (*read_waveform)(struct pwm_chip *chip, struct pwm_device *pwm, + void *wfhw); + int (*write_waveform)(struct pwm_chip *chip, struct pwm_device *pwm, + const void *wfhw); + int (*apply)(struct pwm_chip *chip, struct pwm_device *pwm, const struct pwm_state *state); int (*get_state)(struct pwm_chip *chip, struct pwm_device *pwm, -- cgit v1.2.3 From 6c5126c6406d1c31e91f5b925c621c1c785366be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 20 Sep 2024 10:57:59 +0200 Subject: pwm: Provide new consumer API functions for waveforms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Provide API functions for consumers to work with waveforms. Note that one relevant difference between pwm_get_state() and pwm_get_waveform*() is that the latter yields the actually configured hardware state, while the former yields the last state passed to pwm_apply*() and so doesn't account for hardware specific rounding. Signed-off-by: Uwe Kleine-König Tested-by: Trevor Gamblin Link: https://lore.kernel.org/r/6c97d27682853f603e18e9196043886dd671845d.1726819463.git.u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- include/linux/pwm.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index d8cfe1c9b19d..c3d9ddeafa65 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -358,7 +358,11 @@ static inline void pwmchip_set_drvdata(struct pwm_chip *chip, void *data) } #if IS_ENABLED(CONFIG_PWM) -/* PWM user APIs */ + +/* PWM consumer APIs */ +int pwm_round_waveform_might_sleep(struct pwm_device *pwm, struct pwm_waveform *wf); +int pwm_get_waveform_might_sleep(struct pwm_device *pwm, struct pwm_waveform *wf); +int pwm_set_waveform_might_sleep(struct pwm_device *pwm, const struct pwm_waveform *wf, bool exact); int pwm_apply_might_sleep(struct pwm_device *pwm, const struct pwm_state *state); int pwm_apply_atomic(struct pwm_device *pwm, const struct pwm_state *state); int pwm_adjust_config(struct pwm_device *pwm); -- cgit v1.2.3 From caf78b0465053c23aa6211b9815dd5433766627d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 24 Sep 2024 12:08:08 +0200 Subject: regcache: Improve documentation of available cache types There is some user confusion about which cache types to choose when which is not helped by the lack of any central documentation providing an overview of what's available. Provide a short overview in the API header to try to help reduce this. Signed-off-by: Mark Brown Link: https://patch.msgid.link/20240924-regcache-document-types-v1-1-e157054e1215@kernel.org Signed-off-by: Mark Brown --- include/linux/regmap.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index f9ccad32fc5c..da07584284ab 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -54,7 +54,14 @@ struct sdw_slave; #define REGMAP_UPSHIFT(s) (-(s)) #define REGMAP_DOWNSHIFT(s) (s) -/* An enum of all the supported cache types */ +/* + * The supported cache types, the default is no cache. Any new caches + * should usually use the maple tree cache unless they specifically + * require that there are never any allocations at runtime and can't + * provide defaults in which case they should use the flat cache. The + * rbtree cache *may* have some performance advantage for very low end + * systems that make heavy use of cache syncs but is mainly legacy. + */ enum regcache_type { REGCACHE_NONE, REGCACHE_RBTREE, -- cgit v1.2.3 From 5441b6975adc26aeaca316b9075e04a98238b1b3 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Wed, 25 Sep 2024 17:38:04 +0800 Subject: regulator: Add of_regulator_get_optional() for pure DT regulator lookup The to-be-introduced I2C component prober needs to enable regulator supplies (and toggle GPIO pins) for the various components it intends to probe. To support this, a new "pure DT lookup" method for getting regulator supplies is needed, since the device normally requesting the supply won't get created until after the component is probed to be available. Add a new of_regulator_get_optional() function for this. This mirrors the existing regulator_get_optional() function, but is OF-specific. The underlying code that supports the existing regulator_get*() functions has been reworked in previous patches to support this specific case. Also convert an existing usage of "dev && dev->of_node" to "dev_of_node(dev)". Link: https://lore.kernel.org/all/20231220203537.83479-2-jernej.skrabec@gmail.com/ [1] Signed-off-by: Chen-Yu Tsai Link: https://patch.msgid.link/20240925093807.1026949-2-wenst@chromium.org Reviewed-by: Andy Shevchenko Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index b9ce521910a0..2b22f07e491c 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -168,6 +168,19 @@ int devm_regulator_get_enable_read_voltage(struct device *dev, const char *id); void regulator_put(struct regulator *regulator); void devm_regulator_put(struct regulator *regulator); +#if IS_ENABLED(CONFIG_OF) +struct regulator *__must_check of_regulator_get_optional(struct device *dev, + struct device_node *node, + const char *id); +#else +static inline struct regulator *__must_check of_regulator_get_optional(struct device *dev, + struct device_node *node, + const char *id) +{ + return ERR_PTR(-ENODEV); +} +#endif + int regulator_register_supply_alias(struct device *dev, const char *id, struct device *alias_dev, const char *alias_id); @@ -350,6 +363,13 @@ devm_regulator_get_optional(struct device *dev, const char *id) return ERR_PTR(-ENODEV); } +static inline struct regulator *__must_check of_regulator_get_optional(struct device *dev, + struct device_node *node, + const char *id) +{ + return ERR_PTR(-ENODEV); +} + static inline void regulator_put(struct regulator *regulator) { } -- cgit v1.2.3 From 36ec3f437227470568e5f460997f367f5446a34d Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Wed, 25 Sep 2024 17:38:05 +0800 Subject: regulator: Add devres version of of_regulator_get_optional() There are existing uses for a devres version of of_regulator_get_optional() in power domain drivers. On MediaTek platforms, power domains may have regulator supplies tied to them. The driver currently tries to use devm_regulator_get() to not have to manage the lifecycle, but ends up doing it in a very hacky way by replacing the device node of the power domain controller device to the device node of the power domain that is currently being registered, getting the supply, and reverting the device node. Provide a better API so that the hack can be replaced. Signed-off-by: Chen-Yu Tsai Link: https://patch.msgid.link/20240925093807.1026949-3-wenst@chromium.org Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 2b22f07e491c..8c3c372ad735 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -172,6 +172,9 @@ void devm_regulator_put(struct regulator *regulator); struct regulator *__must_check of_regulator_get_optional(struct device *dev, struct device_node *node, const char *id); +struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev, + struct device_node *node, + const char *id); #else static inline struct regulator *__must_check of_regulator_get_optional(struct device *dev, struct device_node *node, @@ -179,6 +182,13 @@ static inline struct regulator *__must_check of_regulator_get_optional(struct de { return ERR_PTR(-ENODEV); } + +static inline struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev, + struct device_node *node, + const char *id) +{ + return ERR_PTR(-ENODEV); +} #endif int regulator_register_supply_alias(struct device *dev, const char *id, @@ -370,6 +380,13 @@ static inline struct regulator *__must_check of_regulator_get_optional(struct de return ERR_PTR(-ENODEV); } +static inline struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev, + struct device_node *node, + const char *id) +{ + return ERR_PTR(-ENODEV); +} + static inline void regulator_put(struct regulator *regulator) { } -- cgit v1.2.3 From 0809a9ccac4a2ffdfd1561bb551aec6099775545 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Mon, 2 Sep 2024 20:59:47 +0800 Subject: spi: remove {devm_}spi_alloc_master/slave() All the {devm_}spi_alloc_master/slave() have been replaced, so they can be removed and replaced in doc and comment. No functional changed. Signed-off-by: Yang Yingliang Link: https://patch.msgid.link/20240902125947.1368-8-yangyingliang@huaweicloud.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 30 ------------------------------ 1 file changed, 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 4b95663163e0..8497f4747e24 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -824,21 +824,6 @@ void spi_take_timestamp_post(struct spi_controller *ctlr, extern struct spi_controller *__spi_alloc_controller(struct device *host, unsigned int size, bool slave); -static inline struct spi_controller *spi_alloc_master(struct device *host, - unsigned int size) -{ - return __spi_alloc_controller(host, size, false); -} - -static inline struct spi_controller *spi_alloc_slave(struct device *host, - unsigned int size) -{ - if (!IS_ENABLED(CONFIG_SPI_SLAVE)) - return NULL; - - return __spi_alloc_controller(host, size, true); -} - static inline struct spi_controller *spi_alloc_host(struct device *dev, unsigned int size) { @@ -858,21 +843,6 @@ struct spi_controller *__devm_spi_alloc_controller(struct device *dev, unsigned int size, bool slave); -static inline struct spi_controller *devm_spi_alloc_master(struct device *dev, - unsigned int size) -{ - return __devm_spi_alloc_controller(dev, size, false); -} - -static inline struct spi_controller *devm_spi_alloc_slave(struct device *dev, - unsigned int size) -{ - if (!IS_ENABLED(CONFIG_SPI_SLAVE)) - return NULL; - - return __devm_spi_alloc_controller(dev, size, true); -} - static inline struct spi_controller *devm_spi_alloc_host(struct device *dev, unsigned int size) { -- cgit v1.2.3 From 6074e905023d09f64f2c896f475820a5623deb2c Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Mon, 16 Sep 2024 13:00:25 +0200 Subject: firmware: sysfb: Add a sysfb_handles_screen_info() helper function That can be used by drivers to check if the Generic System Framebuffers (sysfb) support can handle the data contained in the global screen_info. Drivers might need this information to know if have to setup the system framebuffer, or if they have to delegate this action to sysfb instead. Suggested-by: Thomas Zimmermann Signed-off-by: Javier Martinez Canillas Reviewed-by: Thomas Zimmermann Link: https://lore.kernel.org/r/20240916110040.1688511-2-javierm@redhat.com Signed-off-by: Tzung-Bi Shih --- include/linux/sysfb.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sysfb.h b/include/linux/sysfb.h index bef5f06a91de..07cbab516942 100644 --- a/include/linux/sysfb.h +++ b/include/linux/sysfb.h @@ -60,12 +60,19 @@ struct efifb_dmi_info { void sysfb_disable(struct device *dev); +bool sysfb_handles_screen_info(void); + #else /* CONFIG_SYSFB */ static inline void sysfb_disable(struct device *dev) { } +static inline bool sysfb_handles_screen_info(void) +{ + return false; +} + #endif /* CONFIG_SYSFB */ #ifdef CONFIG_EFI -- cgit v1.2.3 From e4ca0e59c39442546866f3dd514a3a5956577daf Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 3 Sep 2024 20:59:04 +0300 Subject: types: Complement the aligned types with signed 64-bit one Some user may want to use aligned signed 64-bit type. Provide it for them. Signed-off-by: Andy Shevchenko Link: https://patch.msgid.link/20240903180218.3640501-2-andriy.shevchenko@linux.intel.com Signed-off-by: Jonathan Cameron --- include/linux/types.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/types.h b/include/linux/types.h index 2bc8766ba20c..2d7b9ae8714c 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -115,8 +115,9 @@ typedef u64 u_int64_t; typedef s64 int64_t; #endif -/* this is a special 64bit data type that is 8-byte aligned */ +/* These are the special 64-bit data types that are 8-byte aligned */ #define aligned_u64 __aligned_u64 +#define aligned_s64 __aligned_s64 #define aligned_be64 __aligned_be64 #define aligned_le64 __aligned_le64 -- cgit v1.2.3 From faf178607772f28006e403d7bab6c4217d4ee447 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 7 Sep 2024 19:24:46 +0200 Subject: iio: adc: Constify struct iio_map 'struct iio_map' are not modified in these drivers. Constifying this structure moves some data to a read-only section, so increase overall security. In order to do it, the prototype of iio_map_array_register() and devm_iio_map_array_register(), and a few structures that hold a "struct iio_map *" need to be adjusted. On a x86_64, with allmodconfig, as an example: Before: ====== text data bss dec hex filename 21086 760 0 21846 5556 drivers/iio/adc/axp20x_adc.o After: ===== text data bss dec hex filename 21470 360 0 21830 5546 drivers/iio/adc/axp20x_adc.o 33842 1697 384 35923 8c53 drivers/iio/addac/ad74413r.o -- Compile tested only Signed-off-by: Christophe JAILLET Link: https://patch.msgid.link/5729dc3cc3892ecf0d8ea28c5f7307b34e27493e.1725729801.git.christophe.jaillet@wanadoo.fr Signed-off-by: Jonathan Cameron --- include/linux/iio/driver.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/driver.h b/include/linux/iio/driver.h index 7a157ed218f6..7f8b55551ed0 100644 --- a/include/linux/iio/driver.h +++ b/include/linux/iio/driver.h @@ -18,7 +18,7 @@ struct iio_map; * @map: array of mappings specifying association of channel with client */ int iio_map_array_register(struct iio_dev *indio_dev, - struct iio_map *map); + const struct iio_map *map); /** * iio_map_array_unregister() - tell the core to remove consumer mappings for @@ -38,6 +38,7 @@ int iio_map_array_unregister(struct iio_dev *indio_dev); * handle de-registration of the IIO map object when the device's refcount goes to * zero. */ -int devm_iio_map_array_register(struct device *dev, struct iio_dev *indio_dev, struct iio_map *maps); +int devm_iio_map_array_register(struct device *dev, struct iio_dev *indio_dev, + const struct iio_map *maps); #endif -- cgit v1.2.3 From 0f702757c68b9790a844e5c07073d3d1c777b13a Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Mon, 26 Aug 2024 11:34:11 +0800 Subject: ARM: samsung: Remove obsoleted declaration for s3c_hwmon_set_platdata The s3c_hwmon_set_platdata() have been removed since commit 0d297df03890 ("ARM: s3c: simplify platform code"), and now it is useless, so remove it. Signed-off-by: Gaosheng Cui Link: https://lore.kernel.org/r/20240826033411.4022822-1-cuigaosheng1@huawei.com Signed-off-by: Krzysztof Kozlowski --- include/linux/platform_data/hwmon-s3c.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/hwmon-s3c.h b/include/linux/platform_data/hwmon-s3c.h index 1707ad4147df..7d21e0c41037 100644 --- a/include/linux/platform_data/hwmon-s3c.h +++ b/include/linux/platform_data/hwmon-s3c.h @@ -33,14 +33,4 @@ struct s3c_hwmon_pdata { struct s3c_hwmon_chcfg *in[8]; }; -/** - * s3c_hwmon_set_platdata - Set platform data for S3C HWMON device - * @pd: Platform data to register to device. - * - * Register the given platform data for use with the S3C HWMON device. - * The call will copy the platform data, so the board definitions can - * make the structure itself __initdata. - */ -extern void __init s3c_hwmon_set_platdata(struct s3c_hwmon_pdata *pd); - #endif /* __HWMON_S3C_H__ */ -- cgit v1.2.3 From 3a6ad95d97eb62a7b7c804ef7eeb329a1f697d00 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Mon, 26 Aug 2024 11:31:18 +0800 Subject: ASoC: samsung: Remove obsoleted declaration for s3c64xx_ac97_setup_gpio The s3c64xx_ac97_setup_gpio() have been removed since commit 0d297df03890 ("ARM: s3c: simplify platform code"), and now it is useless, so remove it. Signed-off-by: Gaosheng Cui Link: https://lore.kernel.org/r/20240826033118.4021727-1-cuigaosheng1@huawei.com Signed-off-by: Krzysztof Kozlowski --- include/linux/platform_data/asoc-s3c.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/asoc-s3c.h b/include/linux/platform_data/asoc-s3c.h index f9c00f839e9f..085dd8e8af76 100644 --- a/include/linux/platform_data/asoc-s3c.h +++ b/include/linux/platform_data/asoc-s3c.h @@ -13,8 +13,6 @@ #include -extern void s3c64xx_ac97_setup_gpio(int); - struct samsung_i2s_type { /* If the Primary DAI has 5.1 Channels */ #define QUIRK_PRI_6CHAN (1 << 0) -- cgit v1.2.3 From a370b72ec7165ebe1230d0225cbe66f6526e68ef Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sun, 18 Aug 2024 21:48:13 +0900 Subject: tracing: Add a comment about ftrace_regs definition To clarify what will be expected on ftrace_regs, add a comment to the architecture independent definition of the ftrace_regs. Signed-off-by: Masami Hiramatsu (Google) Acked-by: Mark Rutland Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index fd5e84d0ec47..42106b3de396 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -117,6 +117,32 @@ extern int ftrace_enabled; #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS +/** + * ftrace_regs - ftrace partial/optimal register set + * + * ftrace_regs represents a group of registers which is used at the + * function entry and exit. There are three types of registers. + * + * - Registers for passing the parameters to callee, including the stack + * pointer. (e.g. rcx, rdx, rdi, rsi, r8, r9 and rsp on x86_64) + * - Registers for passing the return values to caller. + * (e.g. rax and rdx on x86_64) + * - Registers for hooking the function call and return including the + * frame pointer (the frame pointer is architecture/config dependent) + * (e.g. rip, rbp and rsp for x86_64) + * + * Also, architecture dependent fields can be used for internal process. + * (e.g. orig_ax on x86_64) + * + * On the function entry, those registers will be restored except for + * the stack pointer, so that user can change the function parameters + * and instruction pointer (e.g. live patching.) + * On the function exit, only registers which is used for return values + * are restored. + * + * NOTE: user *must not* access regs directly, only do it via APIs, because + * the member can be changed according to the architecture. + */ struct ftrace_regs { struct pt_regs regs; }; -- cgit v1.2.3 From a312a0f7834e605e7c41570f0e9525d0fc4a70a4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 14 Sep 2024 17:48:06 -0400 Subject: fgraph: Use fgraph data to store subtime for profiler Instead of having the "subtime" for the function profiler in the infrastructure ftrace_ret_stack structure, have it use the fgraph data reserve and retrieve functions. This will keep the limited shadow stack from wasting 8 bytes for something that is seldom used. Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Jiri Olsa Link: https://lore.kernel.org/20240914214826.780323141@goodmis.org Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 42106b3de396..aabd348cad4a 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1081,6 +1081,7 @@ struct fgraph_ops { void *fgraph_reserve_data(int idx, int size_bytes); void *fgraph_retrieve_data(int idx, int *size_bytes); +void *fgraph_retrieve_parent_data(int idx, int *size_bytes, int depth); /* * Stack of return addresses for functions @@ -1091,9 +1092,6 @@ struct ftrace_ret_stack { unsigned long ret; unsigned long func; unsigned long long calltime; -#ifdef CONFIG_FUNCTION_PROFILER - unsigned long long subtime; -#endif #ifdef HAVE_FUNCTION_GRAPH_FP_TEST unsigned long fp; #endif -- cgit v1.2.3 From 3c9880f3ab52b52b5b4e1850a70e80dd7329cb4c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 14 Sep 2024 17:48:07 -0400 Subject: ftrace: Use a running sleeptime instead of saving on shadow stack The fgraph "sleep-time" option tells the function graph tracer and the profiler whether to include the time a function "sleeps" (is scheduled off the CPU) in its duration for the function. By default it is true, which means the duration of a function is calculated by the timestamp of when the function was entered to the timestamp of when it exits. If the "sleep-time" option is disabled, it needs to remove the time that the task was not running on the CPU during the function. Currently it is done in a sched_switch tracepoint probe where it moves the "calltime" (time of entry of the function) forward by the sleep time calculated. It updates all the calltime in the shadow stack. This is time consuming for those users of the function graph tracer that does not care about the sleep time. Instead, add a "ftrace_sleeptime" to the task_struct that gets the sleep time added each time the task wakes up. Then have the function entry save the current "ftrace_sleeptime" and on function exit, move the calltime forward by the difference of the current "ftrace_sleeptime" from the saved sleeptime. This removes one dependency of "calltime" needed to be on the shadow stack. It also simplifies the code that removes the sleep time of functions. TODO: Only enable the sched_switch tracepoint when this is needed. Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Jiri Olsa Link: https://lore.kernel.org/20240914214826.938908568@goodmis.org Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index e6ee4258169a..c08f3bdb11a5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1441,6 +1441,7 @@ struct task_struct { /* Timestamp for last schedule: */ unsigned long long ftrace_timestamp; + unsigned long long ftrace_sleeptime; /* * Number of functions that haven't been traced -- cgit v1.2.3 From f1f36e22bee967db5e812a65e24389e54c46f3c2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 14 Sep 2024 17:48:08 -0400 Subject: ftrace: Have calltime be saved in the fgraph storage The calltime field in the shadow stack frame is only used by the function graph tracer and profiler. But now that there's other users of the function graph infrastructure, this adds overhead and wastes space on the shadow stack. Move the calltime to the fgraph data storage, where the function graph and profiler entry functions will save it in its own graph storage and retrieve it in its exit functions. Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Jiri Olsa Link: https://lore.kernel.org/20240914214827.096968730@goodmis.org Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index aabd348cad4a..e684addf6508 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1091,7 +1091,6 @@ void *fgraph_retrieve_parent_data(int idx, int *size_bytes, int depth); struct ftrace_ret_stack { unsigned long ret; unsigned long func; - unsigned long long calltime; #ifdef HAVE_FUNCTION_GRAPH_FP_TEST unsigned long fp; #endif -- cgit v1.2.3 From 26228256b796eb0145bdfb2ae34ec8c4c0ef1319 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Fri, 6 Sep 2024 09:52:16 +0200 Subject: backlight: lcd: Test against struct fb_info.lcd_dev Add struct fb_info.lcd_dev for fbdev drivers to store a reference to their lcd device. Update the lcd's fb_notifier_callback() to test for this field. The lcd module can now detect if an lcd device belongs to an fbdev device. This works similar to the bl_dev for backlights and will allow for the removal of the check_fb callback from several fbdev driver's lcd devices. Signed-off-by: Thomas Zimmermann Reviewed-by: Daniel Thompson Link: https://lore.kernel.org/r/20240906075439.98476-3-tzimmermann@suse.de Signed-off-by: Lee Jones --- include/linux/fb.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index 267b59ead432..5ba187e08cf7 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -21,6 +21,7 @@ struct fb_info; struct file; struct i2c_adapter; struct inode; +struct lcd_device; struct module; struct notifier_block; struct page; @@ -480,6 +481,13 @@ struct fb_info { struct mutex bl_curve_mutex; u8 bl_curve[FB_BACKLIGHT_LEVELS]; #endif + + /* + * Assigned LCD device; set before framebuffer + * registration, remove after unregister + */ + struct lcd_device *lcd_dev; + #ifdef CONFIG_FB_DEFERRED_IO struct delayed_work deferred_work; unsigned long npagerefs; @@ -754,6 +762,11 @@ static inline struct backlight_device *fb_bl_device(struct fb_info *info) } #endif +static inline struct lcd_device *fb_lcd_device(struct fb_info *info) +{ + return info->lcd_dev; +} + /* fbmon.c */ #define FB_MAXTIMINGS 0 #define FB_VSYNCTIMINGS 1 -- cgit v1.2.3 From 48ffe2074c2864ab64ee2004e7ebf3d6a6730fbf Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Fri, 6 Sep 2024 09:52:17 +0200 Subject: backlight: lcd: Add LCD_POWER_ constants for power states Duplicate FB_BLANK_ constants as LCD_POWER_ constants in the lcd header file. Allows lcd drivers to avoid including the fbdev header file and removes a compile-time dependency between the two subsystems. The new LCD_POWER_ constants have the same values as their FB_BLANK_ counterparts. Hence semantics does not change and the lcd drivers can be converted one by one. Each instance of FB_BLANK_UNBLANK becomes LCD_POWER_ON, each of FB_BLANK_POWERDOWN becomes LCD_POWER_OFF, FB_BLANK_NORMAL becomes LCD_POWER_REDUCED and FB_BLANK_VSYNC_SUSPEND becomes LCD_POWER_REDUCED_VSYNC_SUSPEND. Lcd code or drivers do not use FB_BLANK_HSYNC_SUSPEND, so no new constants for this is being added. The tokens LCD_POWER_REDUCED and LCD_POWER_REDUCED_VSYNC_SUSPEND are deprecated and drivers should replace them with LCD_POWER_ON and LCD_POWER_OFF. See also commit a1cacb8a8e70 ("backlight: Add BACKLIGHT_POWER_ constants for power states"), which added similar constants for backlight drivers. v2: - fix typo in commit description Signed-off-by: Thomas Zimmermann Reviewed-by: Daniel Thompson Link: https://lore.kernel.org/r/20240906075439.98476-4-tzimmermann@suse.de Signed-off-by: Lee Jones --- include/linux/lcd.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lcd.h b/include/linux/lcd.h index 68703a51dc53..dfcc54d327f5 100644 --- a/include/linux/lcd.h +++ b/include/linux/lcd.h @@ -14,6 +14,11 @@ #include #include +#define LCD_POWER_ON (0) +#define LCD_POWER_REDUCED (1) // deprecated; don't use in new code +#define LCD_POWER_REDUCED_VSYNC_SUSPEND (2) // deprecated; don't use in new code +#define LCD_POWER_OFF (4) + /* Notes on locking: * * lcd_device->ops_lock is an internal backlight lock protecting the ops -- cgit v1.2.3 From 43e1120deb3768c86aa3875c7073658e44a30ea5 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Fri, 6 Sep 2024 09:52:40 +0200 Subject: backlight: lcd: Replace check_fb with controls_device Rename check_fb in struct lcd_ops to controls_device. The callback is now independent from fbdev's struct fb_info and tests if an lcd device controls a hardware display device. The new naming and semantics follow similar functionality for backlight devices. v2: - fix typos in commit description (Daniel) Signed-off-by: Thomas Zimmermann Reviewed-by: Daniel Thompson Link: https://lore.kernel.org/r/20240906075439.98476-27-tzimmermann@suse.de Signed-off-by: Lee Jones --- include/linux/lcd.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lcd.h b/include/linux/lcd.h index dfcc54d327f5..8399b5ed48f2 100644 --- a/include/linux/lcd.h +++ b/include/linux/lcd.h @@ -35,7 +35,6 @@ */ struct lcd_device; -struct fb_info; struct lcd_properties { /* The maximum value for contrast (read-only) */ @@ -54,9 +53,18 @@ struct lcd_ops { int (*set_contrast)(struct lcd_device *, int contrast); /* Set LCD panel mode (resolutions ...) */ int (*set_mode)(struct lcd_device *, struct fb_videomode *); - /* Check if given framebuffer device is the one LCD is bound to; - return 0 if not, !=0 if it is. If NULL, lcd always matches the fb. */ - int (*check_fb)(struct lcd_device *, struct fb_info *); + + /* + * Check if the LCD controls the given display device. This + * operation is optional and if not implemented it is assumed that + * the display is always the one controlled by the LCD. + * + * RETURNS: + * + * If display_dev is NULL or display_dev matches the device controlled by + * the LCD, return true. Otherwise return false. + */ + bool (*controls_device)(struct lcd_device *lcd, struct device *display_device); }; struct lcd_device { -- cgit v1.2.3 From 02e224d096ef58fe59e96609de6018e133f33512 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Fri, 6 Sep 2024 09:52:41 +0200 Subject: backlight: lcd: Remove struct fb_videomode from set_mode callback Implementations of struct lcd_ops.set_mode only require the resolution from struct fb_videomode. Pass the xres and yres fields, but remove the dependency on the fbdev data structure. Signed-off-by: Thomas Zimmermann Reviewed-by: Daniel Thompson Link: https://lore.kernel.org/r/20240906075439.98476-28-tzimmermann@suse.de Signed-off-by: Lee Jones --- include/linux/lcd.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lcd.h b/include/linux/lcd.h index 8399b5ed48f2..59a80b396a71 100644 --- a/include/linux/lcd.h +++ b/include/linux/lcd.h @@ -51,8 +51,11 @@ struct lcd_ops { int (*get_contrast)(struct lcd_device *); /* Set LCD panel contrast */ int (*set_contrast)(struct lcd_device *, int contrast); - /* Set LCD panel mode (resolutions ...) */ - int (*set_mode)(struct lcd_device *, struct fb_videomode *); + + /* + * Set LCD panel mode (resolutions ...) + */ + int (*set_mode)(struct lcd_device *lcd, u32 xres, u32 yres); /* * Check if the LCD controls the given display device. This -- cgit v1.2.3 From 0d580d99749e759b62dc8e28f511310e9235da7a Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Fri, 6 Sep 2024 09:52:42 +0200 Subject: backlight: lcd: Do not include in lcd header With the exception of fb_notifier_callback(), none of the lcd code uses fbdev; especially not the lcd drivers. Remove the include statement for from the public lcd header. v2: - fix typos in commit description Signed-off-by: Thomas Zimmermann Reviewed-by: Daniel Thompson Link: https://lore.kernel.org/r/20240906075439.98476-29-tzimmermann@suse.de Signed-off-by: Lee Jones --- include/linux/lcd.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lcd.h b/include/linux/lcd.h index 59a80b396a71..c3ccdff4519a 100644 --- a/include/linux/lcd.h +++ b/include/linux/lcd.h @@ -12,7 +12,6 @@ #include #include #include -#include #define LCD_POWER_ON (0) #define LCD_POWER_REDUCED (1) // deprecated; don't use in new code -- cgit v1.2.3 From 86b9ce0a8a6cf9397503920cd5412d207a93fae9 Mon Sep 17 00:00:00 2001 From: Sai Krishna Potthuri Date: Fri, 6 Sep 2024 16:31:12 +0530 Subject: firmware: xilinx: Add Pinctrl Get Attribute ID Add Pinctrl Get Attribute ID to the query ids list. Signed-off-by: Sai Krishna Potthuri Link: https://lore.kernel.org/20240906110113.3154327-3-sai.krishna.potthuri@amd.com Signed-off-by: Linus Walleij --- include/linux/firmware/xlnx-zynqmp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index d7d07afc0532..3b4ce4ec5d3f 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -238,6 +238,7 @@ enum pm_query_id { PM_QID_PINCTRL_GET_PIN_GROUPS = 11, PM_QID_CLOCK_GET_NUM_CLOCKS = 12, PM_QID_CLOCK_GET_MAX_DIVISOR = 13, + PM_QID_PINCTRL_GET_ATTRIBUTES = 15, }; enum rpu_oper_mode { -- cgit v1.2.3 From b875bd5b381e114115922944f7a01e31f8b07c2a Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 11 Sep 2024 15:43:00 -0400 Subject: exportfs: Remove EXPORT_OP_ASYNC_LOCK Now that GFS2 and OCFS2 are signalling async ->lock() support with FOP_ASYNC_LOCK and checks for support are converted, we can remove EXPORT_OP_ASYNC_LOCK. Signed-off-by: Benjamin Coddington Link: https://lore.kernel.org/r/0a114db814fec3086f937ae3d44a086f13b8de26.1726083391.git.bcodding@redhat.com Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/exportfs.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 893a1d21dc1c..1ab165c2939f 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -250,19 +250,6 @@ struct export_operations { unsigned long flags; }; -/** - * exportfs_lock_op_is_async() - export op supports async lock operation - * @export_ops: the nfs export operations to check - * - * Returns true if the nfs export_operations structure has - * EXPORT_OP_ASYNC_LOCK in their flags set - */ -static inline bool -exportfs_lock_op_is_async(const struct export_operations *export_ops) -{ - return export_ops->flags & EXPORT_OP_ASYNC_LOCK; -} - extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid, int *max_len, struct inode *parent, int flags); -- cgit v1.2.3 From dad35f7d2fc14e446669d4cab100597a6798eae5 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Wed, 25 Sep 2024 18:40:09 +0200 Subject: reset: replace boolean parameters with flags parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce enum reset_control_flags and replace the list of boolean parameters to the internal reset_control_get functions with a single flags parameter, before adding more boolean options. The separate boolean parameters have been shown to be error prone in the past. See for example commit a57f68ddc886 ("reset: Fix devm bulk optional exclusive control getter"). Acked-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20240925-reset-get-deasserted-v2-1-b3601bbd0458@pengutronix.de Signed-off-by: Philipp Zabel --- include/linux/reset.h | 161 +++++++++++++++++++++++++++++++------------------- 1 file changed, 99 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/include/linux/reset.h b/include/linux/reset.h index 514ddf003efc..99296af98f81 100644 --- a/include/linux/reset.h +++ b/include/linux/reset.h @@ -25,6 +25,33 @@ struct reset_control_bulk_data { struct reset_control *rstc; }; +#define RESET_CONTROL_FLAGS_BIT_SHARED BIT(0) /* not exclusive */ +#define RESET_CONTROL_FLAGS_BIT_OPTIONAL BIT(1) +#define RESET_CONTROL_FLAGS_BIT_ACQUIRED BIT(2) /* iff exclusive, not released */ + +/** + * enum reset_control_flags - Flags that can be passed to the reset_control_get functions + * to determine the type of reset control. + * These values cannot be OR'd. + * + * @RESET_CONTROL_EXCLUSIVE: exclusive, acquired, + * @RESET_CONTROL_EXCLUSIVE_RELEASED: exclusive, released, + * @RESET_CONTROL_SHARED: shared + * @RESET_CONTROL_OPTIONAL_EXCLUSIVE: optional, exclusive, acquired + * @RESET_CONTROL_OPTIONAL_EXCLUSIVE_RELEASED: optional, exclusive, released + * @RESET_CONTROL_OPTIONAL_SHARED: optional, shared + */ +enum reset_control_flags { + RESET_CONTROL_EXCLUSIVE = RESET_CONTROL_FLAGS_BIT_ACQUIRED, + RESET_CONTROL_EXCLUSIVE_RELEASED = 0, + RESET_CONTROL_SHARED = RESET_CONTROL_FLAGS_BIT_SHARED, + RESET_CONTROL_OPTIONAL_EXCLUSIVE = RESET_CONTROL_FLAGS_BIT_OPTIONAL | + RESET_CONTROL_FLAGS_BIT_ACQUIRED, + RESET_CONTROL_OPTIONAL_EXCLUSIVE_RELEASED = RESET_CONTROL_FLAGS_BIT_OPTIONAL, + RESET_CONTROL_OPTIONAL_SHARED = RESET_CONTROL_FLAGS_BIT_OPTIONAL | + RESET_CONTROL_FLAGS_BIT_SHARED, +}; + #ifdef CONFIG_RESET_CONTROLLER int reset_control_reset(struct reset_control *rstc); @@ -42,30 +69,25 @@ int reset_control_bulk_acquire(int num_rstcs, struct reset_control_bulk_data *rs void reset_control_bulk_release(int num_rstcs, struct reset_control_bulk_data *rstcs); struct reset_control *__of_reset_control_get(struct device_node *node, - const char *id, int index, bool shared, - bool optional, bool acquired); + const char *id, int index, enum reset_control_flags flags); struct reset_control *__reset_control_get(struct device *dev, const char *id, - int index, bool shared, - bool optional, bool acquired); + int index, enum reset_control_flags flags); void reset_control_put(struct reset_control *rstc); int __reset_control_bulk_get(struct device *dev, int num_rstcs, struct reset_control_bulk_data *rstcs, - bool shared, bool optional, bool acquired); + enum reset_control_flags flags); void reset_control_bulk_put(int num_rstcs, struct reset_control_bulk_data *rstcs); int __device_reset(struct device *dev, bool optional); struct reset_control *__devm_reset_control_get(struct device *dev, - const char *id, int index, bool shared, - bool optional, bool acquired); + const char *id, int index, enum reset_control_flags flags); int __devm_reset_control_bulk_get(struct device *dev, int num_rstcs, struct reset_control_bulk_data *rstcs, - bool shared, bool optional, bool acquired); + enum reset_control_flags flags); struct reset_control *devm_reset_control_array_get(struct device *dev, - bool shared, bool optional); -struct reset_control *of_reset_control_array_get(struct device_node *np, - bool shared, bool optional, - bool acquired); + enum reset_control_flags flags); +struct reset_control *of_reset_control_array_get(struct device_node *np, enum reset_control_flags); int reset_control_get_count(struct device *dev); @@ -116,17 +138,19 @@ static inline int __device_reset(struct device *dev, bool optional) static inline struct reset_control *__of_reset_control_get( struct device_node *node, - const char *id, int index, bool shared, - bool optional, bool acquired) + const char *id, int index, enum reset_control_flags flags) { + bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL; + return optional ? NULL : ERR_PTR(-ENOTSUPP); } static inline struct reset_control *__reset_control_get( struct device *dev, const char *id, - int index, bool shared, bool optional, - bool acquired) + int index, enum reset_control_flags flags) { + bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL; + return optional ? NULL : ERR_PTR(-ENOTSUPP); } @@ -162,8 +186,10 @@ reset_control_bulk_release(int num_rstcs, struct reset_control_bulk_data *rstcs) static inline int __reset_control_bulk_get(struct device *dev, int num_rstcs, struct reset_control_bulk_data *rstcs, - bool shared, bool optional, bool acquired) + enum reset_control_flags flags) { + bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL; + return optional ? 0 : -EOPNOTSUPP; } @@ -174,30 +200,36 @@ reset_control_bulk_put(int num_rstcs, struct reset_control_bulk_data *rstcs) static inline struct reset_control *__devm_reset_control_get( struct device *dev, const char *id, - int index, bool shared, bool optional, - bool acquired) + int index, enum reset_control_flags flags) { + bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL; + return optional ? NULL : ERR_PTR(-ENOTSUPP); } static inline int __devm_reset_control_bulk_get(struct device *dev, int num_rstcs, struct reset_control_bulk_data *rstcs, - bool shared, bool optional, bool acquired) + enum reset_control_flags flags) { + bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL; + return optional ? 0 : -EOPNOTSUPP; } static inline struct reset_control * -devm_reset_control_array_get(struct device *dev, bool shared, bool optional) +devm_reset_control_array_get(struct device *dev, enum reset_control_flags flags) { + bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL; + return optional ? NULL : ERR_PTR(-ENOTSUPP); } static inline struct reset_control * -of_reset_control_array_get(struct device_node *np, bool shared, bool optional, - bool acquired) +of_reset_control_array_get(struct device_node *np, enum reset_control_flags flags) { + bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL; + return optional ? NULL : ERR_PTR(-ENOTSUPP); } @@ -236,7 +268,7 @@ static inline int device_reset_optional(struct device *dev) static inline struct reset_control * __must_check reset_control_get_exclusive(struct device *dev, const char *id) { - return __reset_control_get(dev, id, 0, false, false, true); + return __reset_control_get(dev, id, 0, RESET_CONTROL_EXCLUSIVE); } /** @@ -253,7 +285,7 @@ static inline int __must_check reset_control_bulk_get_exclusive(struct device *dev, int num_rstcs, struct reset_control_bulk_data *rstcs) { - return __reset_control_bulk_get(dev, num_rstcs, rstcs, false, false, true); + return __reset_control_bulk_get(dev, num_rstcs, rstcs, RESET_CONTROL_EXCLUSIVE); } /** @@ -274,7 +306,7 @@ static inline struct reset_control * __must_check reset_control_get_exclusive_released(struct device *dev, const char *id) { - return __reset_control_get(dev, id, 0, false, false, false); + return __reset_control_get(dev, id, 0, RESET_CONTROL_EXCLUSIVE_RELEASED); } /** @@ -295,7 +327,7 @@ static inline int __must_check reset_control_bulk_get_exclusive_released(struct device *dev, int num_rstcs, struct reset_control_bulk_data *rstcs) { - return __reset_control_bulk_get(dev, num_rstcs, rstcs, false, false, false); + return __reset_control_bulk_get(dev, num_rstcs, rstcs, RESET_CONTROL_EXCLUSIVE_RELEASED); } /** @@ -316,7 +348,8 @@ static inline int __must_check reset_control_bulk_get_optional_exclusive_released(struct device *dev, int num_rstcs, struct reset_control_bulk_data *rstcs) { - return __reset_control_bulk_get(dev, num_rstcs, rstcs, false, true, false); + return __reset_control_bulk_get(dev, num_rstcs, rstcs, + RESET_CONTROL_OPTIONAL_EXCLUSIVE_RELEASED); } /** @@ -344,7 +377,7 @@ reset_control_bulk_get_optional_exclusive_released(struct device *dev, int num_r static inline struct reset_control *reset_control_get_shared( struct device *dev, const char *id) { - return __reset_control_get(dev, id, 0, true, false, false); + return __reset_control_get(dev, id, 0, RESET_CONTROL_SHARED); } /** @@ -361,7 +394,7 @@ static inline int __must_check reset_control_bulk_get_shared(struct device *dev, int num_rstcs, struct reset_control_bulk_data *rstcs) { - return __reset_control_bulk_get(dev, num_rstcs, rstcs, true, false, false); + return __reset_control_bulk_get(dev, num_rstcs, rstcs, RESET_CONTROL_SHARED); } /** @@ -378,7 +411,7 @@ reset_control_bulk_get_shared(struct device *dev, int num_rstcs, static inline struct reset_control *reset_control_get_optional_exclusive( struct device *dev, const char *id) { - return __reset_control_get(dev, id, 0, false, true, true); + return __reset_control_get(dev, id, 0, RESET_CONTROL_OPTIONAL_EXCLUSIVE); } /** @@ -398,7 +431,7 @@ static inline int __must_check reset_control_bulk_get_optional_exclusive(struct device *dev, int num_rstcs, struct reset_control_bulk_data *rstcs) { - return __reset_control_bulk_get(dev, num_rstcs, rstcs, false, true, true); + return __reset_control_bulk_get(dev, num_rstcs, rstcs, RESET_CONTROL_OPTIONAL_EXCLUSIVE); } /** @@ -415,7 +448,7 @@ reset_control_bulk_get_optional_exclusive(struct device *dev, int num_rstcs, static inline struct reset_control *reset_control_get_optional_shared( struct device *dev, const char *id) { - return __reset_control_get(dev, id, 0, true, true, false); + return __reset_control_get(dev, id, 0, RESET_CONTROL_OPTIONAL_SHARED); } /** @@ -435,7 +468,7 @@ static inline int __must_check reset_control_bulk_get_optional_shared(struct device *dev, int num_rstcs, struct reset_control_bulk_data *rstcs) { - return __reset_control_bulk_get(dev, num_rstcs, rstcs, true, true, false); + return __reset_control_bulk_get(dev, num_rstcs, rstcs, RESET_CONTROL_OPTIONAL_SHARED); } /** @@ -451,7 +484,7 @@ reset_control_bulk_get_optional_shared(struct device *dev, int num_rstcs, static inline struct reset_control *of_reset_control_get_exclusive( struct device_node *node, const char *id) { - return __of_reset_control_get(node, id, 0, false, false, true); + return __of_reset_control_get(node, id, 0, RESET_CONTROL_EXCLUSIVE); } /** @@ -471,7 +504,7 @@ static inline struct reset_control *of_reset_control_get_exclusive( static inline struct reset_control *of_reset_control_get_optional_exclusive( struct device_node *node, const char *id) { - return __of_reset_control_get(node, id, 0, false, true, true); + return __of_reset_control_get(node, id, 0, RESET_CONTROL_OPTIONAL_EXCLUSIVE); } /** @@ -496,7 +529,7 @@ static inline struct reset_control *of_reset_control_get_optional_exclusive( static inline struct reset_control *of_reset_control_get_shared( struct device_node *node, const char *id) { - return __of_reset_control_get(node, id, 0, true, false, false); + return __of_reset_control_get(node, id, 0, RESET_CONTROL_SHARED); } /** @@ -513,7 +546,7 @@ static inline struct reset_control *of_reset_control_get_shared( static inline struct reset_control *of_reset_control_get_exclusive_by_index( struct device_node *node, int index) { - return __of_reset_control_get(node, NULL, index, false, false, true); + return __of_reset_control_get(node, NULL, index, RESET_CONTROL_EXCLUSIVE); } /** @@ -541,7 +574,7 @@ static inline struct reset_control *of_reset_control_get_exclusive_by_index( static inline struct reset_control *of_reset_control_get_shared_by_index( struct device_node *node, int index) { - return __of_reset_control_get(node, NULL, index, true, false, false); + return __of_reset_control_get(node, NULL, index, RESET_CONTROL_SHARED); } /** @@ -560,7 +593,7 @@ static inline struct reset_control * __must_check devm_reset_control_get_exclusive(struct device *dev, const char *id) { - return __devm_reset_control_get(dev, id, 0, false, false, true); + return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_EXCLUSIVE); } /** @@ -580,7 +613,8 @@ static inline int __must_check devm_reset_control_bulk_get_exclusive(struct device *dev, int num_rstcs, struct reset_control_bulk_data *rstcs) { - return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, false, false, true); + return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, + RESET_CONTROL_EXCLUSIVE); } /** @@ -599,7 +633,7 @@ static inline struct reset_control * __must_check devm_reset_control_get_exclusive_released(struct device *dev, const char *id) { - return __devm_reset_control_get(dev, id, 0, false, false, false); + return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_EXCLUSIVE_RELEASED); } /** @@ -619,7 +653,8 @@ static inline int __must_check devm_reset_control_bulk_get_exclusive_released(struct device *dev, int num_rstcs, struct reset_control_bulk_data *rstcs) { - return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, false, false, false); + return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, + RESET_CONTROL_EXCLUSIVE_RELEASED); } /** @@ -638,7 +673,7 @@ static inline struct reset_control * __must_check devm_reset_control_get_optional_exclusive_released(struct device *dev, const char *id) { - return __devm_reset_control_get(dev, id, 0, false, true, false); + return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_OPTIONAL_EXCLUSIVE_RELEASED); } /** @@ -658,7 +693,8 @@ static inline int __must_check devm_reset_control_bulk_get_optional_exclusive_released(struct device *dev, int num_rstcs, struct reset_control_bulk_data *rstcs) { - return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, false, true, false); + return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, + RESET_CONTROL_OPTIONAL_EXCLUSIVE_RELEASED); } /** @@ -673,7 +709,7 @@ devm_reset_control_bulk_get_optional_exclusive_released(struct device *dev, int static inline struct reset_control *devm_reset_control_get_shared( struct device *dev, const char *id) { - return __devm_reset_control_get(dev, id, 0, true, false, false); + return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_SHARED); } /** @@ -693,7 +729,7 @@ static inline int __must_check devm_reset_control_bulk_get_shared(struct device *dev, int num_rstcs, struct reset_control_bulk_data *rstcs) { - return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, true, false, false); + return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, RESET_CONTROL_SHARED); } /** @@ -711,7 +747,7 @@ devm_reset_control_bulk_get_shared(struct device *dev, int num_rstcs, static inline struct reset_control *devm_reset_control_get_optional_exclusive( struct device *dev, const char *id) { - return __devm_reset_control_get(dev, id, 0, false, true, true); + return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_OPTIONAL_EXCLUSIVE); } /** @@ -731,7 +767,8 @@ static inline int __must_check devm_reset_control_bulk_get_optional_exclusive(struct device *dev, int num_rstcs, struct reset_control_bulk_data *rstcs) { - return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, false, true, true); + return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, + RESET_CONTROL_OPTIONAL_EXCLUSIVE); } /** @@ -749,7 +786,7 @@ devm_reset_control_bulk_get_optional_exclusive(struct device *dev, int num_rstcs static inline struct reset_control *devm_reset_control_get_optional_shared( struct device *dev, const char *id) { - return __devm_reset_control_get(dev, id, 0, true, true, false); + return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_OPTIONAL_SHARED); } /** @@ -769,7 +806,7 @@ static inline int __must_check devm_reset_control_bulk_get_optional_shared(struct device *dev, int num_rstcs, struct reset_control_bulk_data *rstcs) { - return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, true, true, false); + return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, RESET_CONTROL_OPTIONAL_SHARED); } /** @@ -787,7 +824,7 @@ devm_reset_control_bulk_get_optional_shared(struct device *dev, int num_rstcs, static inline struct reset_control * devm_reset_control_get_exclusive_by_index(struct device *dev, int index) { - return __devm_reset_control_get(dev, NULL, index, false, false, true); + return __devm_reset_control_get(dev, NULL, index, RESET_CONTROL_EXCLUSIVE); } /** @@ -803,7 +840,7 @@ devm_reset_control_get_exclusive_by_index(struct device *dev, int index) static inline struct reset_control * devm_reset_control_get_shared_by_index(struct device *dev, int index) { - return __devm_reset_control_get(dev, NULL, index, true, false, false); + return __devm_reset_control_get(dev, NULL, index, RESET_CONTROL_SHARED); } /* @@ -851,54 +888,54 @@ static inline struct reset_control *devm_reset_control_get_by_index( static inline struct reset_control * devm_reset_control_array_get_exclusive(struct device *dev) { - return devm_reset_control_array_get(dev, false, false); + return devm_reset_control_array_get(dev, RESET_CONTROL_EXCLUSIVE); } static inline struct reset_control * devm_reset_control_array_get_shared(struct device *dev) { - return devm_reset_control_array_get(dev, true, false); + return devm_reset_control_array_get(dev, RESET_CONTROL_SHARED); } static inline struct reset_control * devm_reset_control_array_get_optional_exclusive(struct device *dev) { - return devm_reset_control_array_get(dev, false, true); + return devm_reset_control_array_get(dev, RESET_CONTROL_OPTIONAL_EXCLUSIVE); } static inline struct reset_control * devm_reset_control_array_get_optional_shared(struct device *dev) { - return devm_reset_control_array_get(dev, true, true); + return devm_reset_control_array_get(dev, RESET_CONTROL_OPTIONAL_SHARED); } static inline struct reset_control * of_reset_control_array_get_exclusive(struct device_node *node) { - return of_reset_control_array_get(node, false, false, true); + return of_reset_control_array_get(node, RESET_CONTROL_EXCLUSIVE); } static inline struct reset_control * of_reset_control_array_get_exclusive_released(struct device_node *node) { - return of_reset_control_array_get(node, false, false, false); + return of_reset_control_array_get(node, RESET_CONTROL_EXCLUSIVE_RELEASED); } static inline struct reset_control * of_reset_control_array_get_shared(struct device_node *node) { - return of_reset_control_array_get(node, true, false, true); + return of_reset_control_array_get(node, RESET_CONTROL_SHARED); } static inline struct reset_control * of_reset_control_array_get_optional_exclusive(struct device_node *node) { - return of_reset_control_array_get(node, false, true, true); + return of_reset_control_array_get(node, RESET_CONTROL_OPTIONAL_EXCLUSIVE); } static inline struct reset_control * of_reset_control_array_get_optional_shared(struct device_node *node) { - return of_reset_control_array_get(node, true, true, true); + return of_reset_control_array_get(node, RESET_CONTROL_OPTIONAL_SHARED); } #endif -- cgit v1.2.3 From d872bed85036f5e60c66b0dd0994346b4ea6470c Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Wed, 25 Sep 2024 18:40:10 +0200 Subject: reset: Add devres helpers to request pre-deasserted reset controls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add devres helpers - devm_reset_control_bulk_get_exclusive_deasserted - devm_reset_control_bulk_get_optional_exclusive_deasserted - devm_reset_control_bulk_get_optional_shared_deasserted - devm_reset_control_bulk_get_shared_deasserted - devm_reset_control_get_exclusive_deasserted - devm_reset_control_get_optional_exclusive_deasserted - devm_reset_control_get_optional_shared_deasserted - devm_reset_control_get_shared_deasserted to request and immediately deassert reset controls. During cleanup, reset_control_assert() will be called automatically on the returned reset controls. Acked-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20240925-reset-get-deasserted-v2-2-b3601bbd0458@pengutronix.de Signed-off-by: Philipp Zabel --- include/linux/reset.h | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) (limited to 'include/linux') diff --git a/include/linux/reset.h b/include/linux/reset.h index 99296af98f81..2986ced69a02 100644 --- a/include/linux/reset.h +++ b/include/linux/reset.h @@ -28,6 +28,7 @@ struct reset_control_bulk_data { #define RESET_CONTROL_FLAGS_BIT_SHARED BIT(0) /* not exclusive */ #define RESET_CONTROL_FLAGS_BIT_OPTIONAL BIT(1) #define RESET_CONTROL_FLAGS_BIT_ACQUIRED BIT(2) /* iff exclusive, not released */ +#define RESET_CONTROL_FLAGS_BIT_DEASSERTED BIT(3) /** * enum reset_control_flags - Flags that can be passed to the reset_control_get functions @@ -35,21 +36,35 @@ struct reset_control_bulk_data { * These values cannot be OR'd. * * @RESET_CONTROL_EXCLUSIVE: exclusive, acquired, + * @RESET_CONTROL_EXCLUSIVE_DEASSERTED: exclusive, acquired, deasserted * @RESET_CONTROL_EXCLUSIVE_RELEASED: exclusive, released, * @RESET_CONTROL_SHARED: shared + * @RESET_CONTROL_SHARED_DEASSERTED: shared, deasserted * @RESET_CONTROL_OPTIONAL_EXCLUSIVE: optional, exclusive, acquired + * @RESET_CONTROL_OPTIONAL_EXCLUSIVE_DEASSERTED: optional, exclusive, acquired, deasserted * @RESET_CONTROL_OPTIONAL_EXCLUSIVE_RELEASED: optional, exclusive, released * @RESET_CONTROL_OPTIONAL_SHARED: optional, shared + * @RESET_CONTROL_OPTIONAL_SHARED_DEASSERTED: optional, shared, deasserted */ enum reset_control_flags { RESET_CONTROL_EXCLUSIVE = RESET_CONTROL_FLAGS_BIT_ACQUIRED, + RESET_CONTROL_EXCLUSIVE_DEASSERTED = RESET_CONTROL_FLAGS_BIT_ACQUIRED | + RESET_CONTROL_FLAGS_BIT_DEASSERTED, RESET_CONTROL_EXCLUSIVE_RELEASED = 0, RESET_CONTROL_SHARED = RESET_CONTROL_FLAGS_BIT_SHARED, + RESET_CONTROL_SHARED_DEASSERTED = RESET_CONTROL_FLAGS_BIT_SHARED | + RESET_CONTROL_FLAGS_BIT_DEASSERTED, RESET_CONTROL_OPTIONAL_EXCLUSIVE = RESET_CONTROL_FLAGS_BIT_OPTIONAL | RESET_CONTROL_FLAGS_BIT_ACQUIRED, + RESET_CONTROL_OPTIONAL_EXCLUSIVE_DEASSERTED = RESET_CONTROL_FLAGS_BIT_OPTIONAL | + RESET_CONTROL_FLAGS_BIT_ACQUIRED | + RESET_CONTROL_FLAGS_BIT_DEASSERTED, RESET_CONTROL_OPTIONAL_EXCLUSIVE_RELEASED = RESET_CONTROL_FLAGS_BIT_OPTIONAL, RESET_CONTROL_OPTIONAL_SHARED = RESET_CONTROL_FLAGS_BIT_OPTIONAL | RESET_CONTROL_FLAGS_BIT_SHARED, + RESET_CONTROL_OPTIONAL_SHARED_DEASSERTED = RESET_CONTROL_FLAGS_BIT_OPTIONAL | + RESET_CONTROL_FLAGS_BIT_SHARED | + RESET_CONTROL_FLAGS_BIT_DEASSERTED, }; #ifdef CONFIG_RESET_CONTROLLER @@ -596,6 +611,25 @@ __must_check devm_reset_control_get_exclusive(struct device *dev, return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_EXCLUSIVE); } +/** + * devm_reset_control_get_exclusive_deasserted - resource managed + * reset_control_get_exclusive() + + * reset_control_deassert() + * @dev: device to be reset by the controller + * @id: reset line name + * + * Managed reset_control_get_exclusive() + reset_control_deassert(). For reset + * controllers returned from this function, reset_control_assert() + + * reset_control_put() is called automatically on driver detach. + * + * See reset_control_get_exclusive() for more information. + */ +static inline struct reset_control * __must_check +devm_reset_control_get_exclusive_deasserted(struct device *dev, const char *id) +{ + return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_EXCLUSIVE_DEASSERTED); +} + /** * devm_reset_control_bulk_get_exclusive - resource managed * reset_control_bulk_get_exclusive() @@ -712,6 +746,25 @@ static inline struct reset_control *devm_reset_control_get_shared( return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_SHARED); } +/** + * devm_reset_control_get_shared_deasserted - resource managed + * reset_control_get_shared() + + * reset_control_deassert() + * @dev: device to be reset by the controller + * @id: reset line name + * + * Managed reset_control_get_shared() + reset_control_deassert(). For reset + * controllers returned from this function, reset_control_assert() + + * reset_control_put() is called automatically on driver detach. + * + * See devm_reset_control_get_shared() for more information. + */ +static inline struct reset_control * __must_check +devm_reset_control_get_shared_deasserted(struct device *dev, const char *id) +{ + return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_SHARED_DEASSERTED); +} + /** * devm_reset_control_bulk_get_shared - resource managed * reset_control_bulk_get_shared() @@ -732,6 +785,28 @@ devm_reset_control_bulk_get_shared(struct device *dev, int num_rstcs, return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, RESET_CONTROL_SHARED); } +/** + * devm_reset_control_bulk_get_shared_deasserted - resource managed + * reset_control_bulk_get_shared() + + * reset_control_bulk_deassert() + * @dev: device to be reset by the controller + * @num_rstcs: number of entries in rstcs array + * @rstcs: array of struct reset_control_bulk_data with reset line names set + * + * Managed reset_control_bulk_get_shared() + reset_control_bulk_deassert(). For + * reset controllers returned from this function, reset_control_bulk_assert() + + * reset_control_bulk_put() are called automatically on driver detach. + * + * See devm_reset_control_bulk_get_shared() for more information. + */ +static inline int __must_check +devm_reset_control_bulk_get_shared_deasserted(struct device *dev, int num_rstcs, + struct reset_control_bulk_data *rstcs) +{ + return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, + RESET_CONTROL_SHARED_DEASSERTED); +} + /** * devm_reset_control_get_optional_exclusive - resource managed * reset_control_get_optional_exclusive() @@ -750,6 +825,25 @@ static inline struct reset_control *devm_reset_control_get_optional_exclusive( return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_OPTIONAL_EXCLUSIVE); } +/** + * devm_reset_control_get_optional_exclusive_deasserted - resource managed + * reset_control_get_optional_exclusive() + + * reset_control_deassert() + * @dev: device to be reset by the controller + * @id: reset line name + * + * Managed reset_control_get_optional_exclusive() + reset_control_deassert(). + * For reset controllers returned from this function, reset_control_assert() + + * reset_control_put() is called automatically on driver detach. + * + * See devm_reset_control_get_optional_exclusive() for more information. + */ +static inline struct reset_control * +devm_reset_control_get_optional_exclusive_deasserted(struct device *dev, const char *id) +{ + return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_OPTIONAL_EXCLUSIVE_DEASSERTED); +} + /** * devm_reset_control_bulk_get_optional_exclusive - resource managed * reset_control_bulk_get_optional_exclusive() @@ -789,6 +883,25 @@ static inline struct reset_control *devm_reset_control_get_optional_shared( return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_OPTIONAL_SHARED); } +/** + * devm_reset_control_get_optional_shared_deasserted - resource managed + * reset_control_get_optional_shared() + + * reset_control_deassert() + * @dev: device to be reset by the controller + * @id: reset line name + * + * Managed reset_control_get_optional_shared() + reset_control_deassert(). For + * reset controllers returned from this function, reset_control_assert() + + * reset_control_put() is called automatically on driver detach. + * + * See devm_reset_control_get_optional_shared() for more information. + */ +static inline struct reset_control * +devm_reset_control_get_optional_shared_deasserted(struct device *dev, const char *id) +{ + return __devm_reset_control_get(dev, id, 0, RESET_CONTROL_OPTIONAL_SHARED_DEASSERTED); +} + /** * devm_reset_control_bulk_get_optional_shared - resource managed * reset_control_bulk_get_optional_shared() -- cgit v1.2.3 From dab9cd4b8e7f5fce4e7a0424991ec4714a780f3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 1 Oct 2024 10:51:39 +0200 Subject: pwm: Add kernel doc for members added to pwm_ops recently MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The callbacks for lowlevel pwm drivers were expanded to handle the new waveform abstraction. When doing that I missed to expand the kernel doc description. This is catched up here. Reported-by: Stephen Rothwell Link: https://lore.kernel.org/linux-next/20241001135207.125ca7af@canb.auug.org.au Fixes: 17e40c25158f ("pwm: New abstraction for PWM waveforms") Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20241001085138.1025818-2-u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- include/linux/pwm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index c3d9ddeafa65..f1cb1e5b0a36 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -276,6 +276,11 @@ struct pwm_capture { * @request: optional hook for requesting a PWM * @free: optional hook for freeing a PWM * @capture: capture and report PWM signal + * @sizeof_wfhw: size (in bytes) of driver specific waveform presentation + * @round_waveform_tohw: convert a struct pwm_waveform to driver specific presentation + * @round_waveform_fromhw: convert a driver specific waveform presentation to struct pwm_waveform + * @read_waveform: read driver specific waveform presentation from hardware + * @write_waveform: write driver specific waveform presentation to hardware * @apply: atomically apply a new PWM config * @get_state: get the current PWM state. */ -- cgit v1.2.3 From 030ace430afcf847f537227afceb22dfe8fb8fc8 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Thu, 26 Sep 2024 22:19:52 +0800 Subject: spi: spi-mem: Allow specifying the byte order in Octal DTR mode There are NOR flashes (Macronix) that swap the bytes on a 16-bit boundary when configured in Octal DTR mode. The byte order of 16-bit words is swapped when read or written in Octal Double Transfer Rate (DTR) mode compared to Single Transfer Rate (STR) modes. If one writes D0 D1 D2 D3 bytes using 1-1-1 mode, and uses 8D-8D-8D SPI mode for reading, it will read back D1 D0 D3 D2. Swapping the bytes may introduce some endianness problems. It can affect the boot sequence if the entire boot sequence is not handled in either 8D-8D-8D mode or 1-1-1 mode. Therefore, it is necessary to swap the bytes back to ensure the same byte order as in STR modes. Fortunately there are controllers that could swap the bytes back at runtime, addressing the flash's endianness requirements. Provide a way for the upper layers to specify the byte order in Octal DTR mode. Merge Tudor's patch and add modifications for suiting newer version of Linux kernel. Suggested-by: Michael Walle Signed-off-by: JaimeLiao Signed-off-by: AlvinZhou Acked-by: Mark Brown Link: https://lore.kernel.org/r/20240926141956.2386374-3-alvinzhou.tw@gmail.com Signed-off-by: Tudor Ambarus --- include/linux/spi/spi-mem.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h index f866d5c8ed32..c46d2b8029be 100644 --- a/include/linux/spi/spi-mem.h +++ b/include/linux/spi/spi-mem.h @@ -90,6 +90,8 @@ enum spi_mem_data_dir { * @data.buswidth: number of IO lanes used to send/receive the data * @data.dtr: whether the data should be sent in DTR mode or not * @data.ecc: whether error correction is required or not + * @data.swap16: whether the byte order of 16-bit words is swapped when read + * or written in Octal DTR mode compared to STR mode. * @data.dir: direction of the transfer * @data.nbytes: number of data bytes to send/receive. Can be zero if the * operation does not involve transferring data @@ -124,7 +126,8 @@ struct spi_mem_op { u8 buswidth; u8 dtr : 1; u8 ecc : 1; - u8 __pad : 6; + u8 swap16 : 1; + u8 __pad : 5; enum spi_mem_data_dir dir; unsigned int nbytes; union { @@ -297,10 +300,13 @@ struct spi_controller_mem_ops { * struct spi_controller_mem_caps - SPI memory controller capabilities * @dtr: Supports DTR operations * @ecc: Supports operations with error correction + * @swap16: Supports swapping bytes on a 16 bit boundary when configured in + * Octal DTR */ struct spi_controller_mem_caps { bool dtr; bool ecc; + bool swap16; }; #define spi_mem_controller_is_capable(ctlr, cap) \ -- cgit v1.2.3 From 848f2bbb363d4cdb4202db328a103fd3c34e21a2 Mon Sep 17 00:00:00 2001 From: Ronak Jain Date: Tue, 30 Jul 2024 01:43:42 -0700 Subject: firmware: xilinx: Add missing debug firmware interfaces Add missing PM EEMI APIs interface in debug firmware driver. The debugfs firmware driver interface is intended for testing and debugging the EEMI APIs only. This interface does not contain any checking regarding improper usage, and the number, type and valid ranges of the arguments. This interface must be used with a lot of care. In fact, accessing this interface during normal PM operation will very likely cause unexpected problems. The debugfs interface shouldn't be used in the production system and hence it is disabled by default in defconfig. Signed-off-by: Ronak Jain Signed-off-by: Radhey Shyam Pandey Signed-off-by: Michal Simek Link: https://lore.kernel.org/r/20240730084342.1683231-1-ronak.jain@amd.com --- include/linux/firmware/xlnx-zynqmp.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index d7d07afc0532..563382cf16f2 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -218,9 +218,13 @@ enum pm_ioctl_id { /* Runtime feature configuration */ IOCTL_SET_FEATURE_CONFIG = 26, IOCTL_GET_FEATURE_CONFIG = 27, + /* IOCTL for Secure Read/Write Interface */ + IOCTL_READ_REG = 28, /* Dynamic SD/GEM configuration */ IOCTL_SET_SD_CONFIG = 30, IOCTL_SET_GEM_CONFIG = 31, + /* IOCTL to get default/current QoS */ + IOCTL_GET_QOS = 34, }; enum pm_query_id { -- cgit v1.2.3 From f33d6099edf78e3c97900c0173fedbfecc025a9e Mon Sep 17 00:00:00 2001 From: Ronak Jain Date: Fri, 30 Aug 2024 03:00:42 -0700 Subject: firmware: xilinx: use u32 for reset ID in reset APIs Refactors the reset handling mechanisms by replacing the reset ID's enum type with a u32. This update improves flexibility, allowing the reset ID to accommodate a broader range of values, including those that may not fit into predefined enum values. The use of u32 for reset ID enhances extensibility, especially for hardware platforms or features where more granular control of reset operations is required. By shifting to a general integer type, this change reduces constraints and simplifies integration with other system components that rely on non-enum-based reset IDs. Signed-off-by: Ronak Jain Link: https://lore.kernel.org/r/20240830100042.3163511-1-ronak.jain@amd.com Signed-off-by: Michal Simek --- include/linux/firmware/xlnx-zynqmp.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 563382cf16f2..5b938fc2adad 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -557,9 +557,9 @@ int zynqmp_pm_get_pll_frac_data(u32 clk_id, u32 *data); int zynqmp_pm_set_sd_tapdelay(u32 node_id, u32 type, u32 value); int zynqmp_pm_sd_dll_reset(u32 node_id, u32 type); int zynqmp_pm_ospi_mux_select(u32 dev_id, u32 select); -int zynqmp_pm_reset_assert(const enum zynqmp_pm_reset reset, +int zynqmp_pm_reset_assert(const u32 reset, const enum zynqmp_pm_reset_action assert_flag); -int zynqmp_pm_reset_get_status(const enum zynqmp_pm_reset reset, u32 *status); +int zynqmp_pm_reset_get_status(const u32 reset, u32 *status); unsigned int zynqmp_pm_bootmode_read(u32 *ps_mode); int zynqmp_pm_bootmode_write(u32 ps_mode); int zynqmp_pm_init_finalize(void); @@ -702,14 +702,13 @@ static inline int zynqmp_pm_ospi_mux_select(u32 dev_id, u32 select) return -ENODEV; } -static inline int zynqmp_pm_reset_assert(const enum zynqmp_pm_reset reset, +static inline int zynqmp_pm_reset_assert(const u32 reset, const enum zynqmp_pm_reset_action assert_flag) { return -ENODEV; } -static inline int zynqmp_pm_reset_get_status(const enum zynqmp_pm_reset reset, - u32 *status) +static inline int zynqmp_pm_reset_get_status(const u32 reset, u32 *status) { return -ENODEV; } -- cgit v1.2.3 From 92fb71333d5737d0296fb968a653dfda4b225175 Mon Sep 17 00:00:00 2001 From: Ronak Jain Date: Thu, 19 Sep 2024 22:55:01 -0700 Subject: firmware: xilinx: add support for new SMC call format Added zynqmp_pm_invoke_fw_fn() to use new SMC format in which lower 12 bits of SMC id are fixed and firmware header is moved to subsequent SMC arguments. The new SMC format supports full request and response buffers. Added zynqmp_pm_get_sip_svc_version() to get SiP SVC version number to check if TF-A is newer or older and use the SMC format accordingly to handle backward compatibility. Used new SMC format for PM_QUERY_DATA API as more response values are required in it. Signed-off-by: Ronak Jain Signed-off-by: Jay Buddhabhatti Link: https://lore.kernel.org/r/20240920055501.2658642-1-ronak.jain@amd.com Signed-off-by: Michal Simek --- include/linux/firmware/xlnx-zynqmp.h | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 5b938fc2adad..76d85ad82ec0 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -3,7 +3,7 @@ * Xilinx Zynq MPSoC Firmware layer * * Copyright (C) 2014-2021 Xilinx - * Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. + * Copyright (C) 2022 - 2024, Advanced Micro Devices, Inc. * * Michal Simek * Davorin Mista @@ -32,6 +32,19 @@ /* SMC SIP service Call Function Identifier Prefix */ #define PM_SIP_SVC 0xC2000000 +/* SMC function ID to get SiP SVC version */ +#define GET_SIP_SVC_VERSION (0x8200ff03U) + +/* SiP Service Calls version numbers */ +#define SIP_SVC_VERSION_MAJOR (0U) +#define SIP_SVC_VERSION_MINOR (2U) + +#define SIP_SVC_PASSTHROUGH_VERSION ((SIP_SVC_VERSION_MAJOR << 16) | \ + SIP_SVC_VERSION_MINOR) + +/* Fixed ID for FW specific APIs */ +#define PASS_THROUGH_FW_CMD_ID GENMASK(11, 0) + /* PM API versions */ #define PM_API_VERSION_1 1 #define PM_API_VERSION_2 2 @@ -51,6 +64,7 @@ #define API_ID_MASK GENMASK(7, 0) #define MODULE_ID_MASK GENMASK(11, 8) +#define PLM_MODULE_ID_MASK GENMASK(15, 8) /* Firmware feature check version mask */ #define FIRMWARE_VERSION_MASK 0xFFFFU @@ -62,7 +76,13 @@ #define GET_CALLBACK_DATA 0xa01 /* Number of 32bits values in payload */ -#define PAYLOAD_ARG_CNT 4U +#define PAYLOAD_ARG_CNT 7U + +/* Number of 64bits arguments for SMC call */ +#define SMC_ARG_CNT_64 8U + +/* Number of 32bits arguments for SMC call */ +#define SMC_ARG_CNT_32 13U /* Number of arguments for a callback */ #define CB_ARG_CNT 4 @@ -130,6 +150,7 @@ enum pm_module_id { PM_MODULE_ID = 0x0, + XPM_MODULE_ID = 0x2, XSEM_MODULE_ID = 0x3, TF_A_MODULE_ID = 0xa, }; @@ -537,6 +558,7 @@ struct zynqmp_pm_query_data { }; int zynqmp_pm_invoke_fn(u32 pm_api_id, u32 *ret_payload, u32 num_args, ...); +int zynqmp_pm_invoke_fw_fn(u32 pm_api_id, u32 *ret_payload, u32 num_args, ...); #if IS_REACHABLE(CONFIG_ZYNQMP_FIRMWARE) int zynqmp_pm_get_api_version(u32 *version); -- cgit v1.2.3 From a849a0273d0f73a252d14d31c5003a8059ea51fc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Sep 2024 15:17:37 +0200 Subject: ntp: Remove unused tick_nsec tick_nsec is only updated in the NTP core, but there are no users. Remove it. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-1-2d52f4e13476@linutronix.de --- include/linux/timex.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/timex.h b/include/linux/timex.h index 3871b06bd302..7f7a12fd8200 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -145,7 +145,6 @@ unsigned long random_get_entropy_fallback(void); * estimated error = NTP dispersion. */ extern unsigned long tick_usec; /* USER_HZ period (usec) */ -extern unsigned long tick_nsec; /* SHIFTED_HZ period (nsec) */ /* Required to safely shift negative values */ #define shift_right(x, s) ({ \ -- cgit v1.2.3 From 66606a93849bfe3cbe9f0b801b40f60b87c54e11 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Sep 2024 15:17:38 +0200 Subject: ntp: Make tick_usec static There are no users of tick_usec outside of the NTP core code. Therefore make tick_usec static. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911-devel-anna-maria-b4-timers-ptp-ntp-v1-2-2d52f4e13476@linutronix.de --- include/linux/timex.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timex.h b/include/linux/timex.h index 7f7a12fd8200..4ee32eff3f22 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -139,13 +139,6 @@ unsigned long random_get_entropy_fallback(void); #define MAXSEC 2048 /* max interval between updates (s) */ #define NTP_PHASE_LIMIT ((MAXPHASE / NSEC_PER_USEC) << 5) /* beyond max. dispersion */ -/* - * kernel variables - * Note: maximum error = NTP sync distance = dispersion + delay / 2; - * estimated error = NTP dispersion. - */ -extern unsigned long tick_usec; /* USER_HZ period (usec) */ - /* Required to safely shift negative values */ #define shift_right(x, s) ({ \ __typeof__(x) __x = (x); \ -- cgit v1.2.3 From 8102c4daf44ab86c2d2226a8136bec905d6e2bd1 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 11 Sep 2024 10:30:20 +0100 Subject: timekeeping: Add the boot clock to system time snapshot For tracing purpose, the boot clock is interesting as it doesn't stop on suspend. Export it as part of the time snapshot. This will later allow the hypervisor to add boot clock timestamps to its events. Signed-off-by: Vincent Donnefort Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20240911093029.3279154-5-vdonnefort@google.com --- include/linux/timekeeping.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index fc12a9ba2c88..e85c27347e44 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -275,6 +275,7 @@ struct ktime_timestamps { * counter value * @cycles: Clocksource counter value to produce the system times * @real: Realtime system time + * @boot: Boot time * @raw: Monotonic raw system time * @cs_id: Clocksource ID * @clock_was_set_seq: The sequence number of clock-was-set events @@ -283,6 +284,7 @@ struct ktime_timestamps { struct system_time_snapshot { u64 cycles; ktime_t real; + ktime_t boot; ktime_t raw; enum clocksource_ids cs_id; unsigned int clock_was_set_seq; -- cgit v1.2.3 From f69767a1ada3ac74be2e1ac0795a05e1d1384eff Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Wed, 2 Oct 2024 11:59:50 -0500 Subject: PCI: Add TLP Processing Hints (TPH) support Add support for PCIe TLP Processing Hints (TPH) support (see PCIe r6.2, sec 6.17). Add TPH register definitions in pci_regs.h, including the TPH Requester capability register, TPH Requester control register, TPH Completer capability, and the ST fields of MSI-X entry. Introduce pcie_enable_tph() and pcie_disable_tph(), enabling drivers to toggle TPH support and configure specific ST mode as needed. Also add a new kernel parameter, "pci=notph", allowing users to disable TPH support across the entire system. Link: https://lore.kernel.org/r/20241002165954.128085-2-wei.huang2@amd.com Co-developed-by: Jing Liu Co-developed-by: Paul Luse Co-developed-by: Eric Van Tassell Signed-off-by: Jing Liu Signed-off-by: Paul Luse Signed-off-by: Eric Van Tassell Signed-off-by: Wei Huang Signed-off-by: Bjorn Helgaas Reviewed-by: Ajit Khaparde Reviewed-by: Somnath Kotur Reviewed-by: Andy Gospodarek Reviewed-by: Jonathan Cameron Reviewed-by: Lukas Wunner --- include/linux/pci-tph.h | 21 +++++++++++++++++++++ include/linux/pci.h | 7 +++++++ 2 files changed, 28 insertions(+) create mode 100644 include/linux/pci-tph.h (limited to 'include/linux') diff --git a/include/linux/pci-tph.h b/include/linux/pci-tph.h new file mode 100644 index 000000000000..58654a334ffb --- /dev/null +++ b/include/linux/pci-tph.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * TPH (TLP Processing Hints) + * + * Copyright (C) 2024 Advanced Micro Devices, Inc. + * Eric Van Tassell + * Wei Huang + */ +#ifndef LINUX_PCI_TPH_H +#define LINUX_PCI_TPH_H + +#ifdef CONFIG_PCIE_TPH +void pcie_disable_tph(struct pci_dev *pdev); +int pcie_enable_tph(struct pci_dev *pdev, int mode); +#else +static inline void pcie_disable_tph(struct pci_dev *pdev) { } +static inline int pcie_enable_tph(struct pci_dev *pdev, int mode) +{ return -EINVAL; } +#endif + +#endif /* LINUX_PCI_TPH_H */ diff --git a/include/linux/pci.h b/include/linux/pci.h index 573b4c4c2be6..8351d76b6e12 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -434,6 +434,7 @@ struct pci_dev { unsigned int ats_enabled:1; /* Address Translation Svc */ unsigned int pasid_enabled:1; /* Process Address Space ID */ unsigned int pri_enabled:1; /* Page Request Interface */ + unsigned int tph_enabled:1; /* TLP Processing Hints */ unsigned int is_managed:1; /* Managed via devres */ unsigned int is_msi_managed:1; /* MSI release via devres installed */ unsigned int needs_freset:1; /* Requires fundamental reset */ @@ -534,6 +535,12 @@ struct pci_dev { /* These methods index pci_reset_fn_methods[] */ u8 reset_methods[PCI_NUM_RESET_METHODS]; /* In priority order */ + +#ifdef CONFIG_PCIE_TPH + u16 tph_cap; /* TPH capability offset */ + u8 tph_mode; /* TPH mode */ + u8 tph_req_type; /* TPH requester type */ +#endif }; static inline struct pci_dev *pci_physfn(struct pci_dev *dev) -- cgit v1.2.3 From d2e8a34876ce69b27f450eebfc550ab8e316f752 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Wed, 2 Oct 2024 11:59:51 -0500 Subject: PCI/TPH: Add Steering Tag support Add pcie_tph_get_cpu_st() to allow a caller to retrieve Steering Tags for a target memory associated with a specific CPU. The ST tag is retrieved by invoking PCI ACPI "_DSM to Query Cache Locality TPH Features" method (rev=0x7, func=0xF) of the device's Root Port device. Add pcie_tph_set_st_entry() to update the device's Steering Tags. The tags will be written into the device's MSI-X table or the ST table located in the TPH Extended Capability space. Co-developed-by: Eric Van Tassell Link: https://lore.kernel.org/r/20241002165954.128085-3-wei.huang2@amd.com Signed-off-by: Eric Van Tassell Signed-off-by: Wei Huang Signed-off-by: Bjorn Helgaas Reviewed-by: Ajit Khaparde Reviewed-by: Somnath Kotur Reviewed-by: Andy Gospodarek --- include/linux/pci-tph.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci-tph.h b/include/linux/pci-tph.h index 58654a334ffb..c3e806c13d64 100644 --- a/include/linux/pci-tph.h +++ b/include/linux/pci-tph.h @@ -9,10 +9,33 @@ #ifndef LINUX_PCI_TPH_H #define LINUX_PCI_TPH_H +/* + * According to the ECN for PCI Firmware Spec, Steering Tag can be different + * depending on the memory type: Volatile Memory or Persistent Memory. When a + * caller query about a target's Steering Tag, it must provide the target's + * tph_mem_type. ECN link: https://members.pcisig.com/wg/PCI-SIG/document/15470. + */ +enum tph_mem_type { + TPH_MEM_TYPE_VM, /* volatile memory */ + TPH_MEM_TYPE_PM /* persistent memory */ +}; + #ifdef CONFIG_PCIE_TPH +int pcie_tph_set_st_entry(struct pci_dev *pdev, + unsigned int index, u16 tag); +int pcie_tph_get_cpu_st(struct pci_dev *dev, + enum tph_mem_type mem_type, + unsigned int cpu_uid, u16 *tag); void pcie_disable_tph(struct pci_dev *pdev); int pcie_enable_tph(struct pci_dev *pdev, int mode); #else +static inline int pcie_tph_set_st_entry(struct pci_dev *pdev, + unsigned int index, u16 tag) +{ return -EINVAL; } +static inline int pcie_tph_get_cpu_st(struct pci_dev *dev, + enum tph_mem_type mem_type, + unsigned int cpu_uid, u16 *tag) +{ return -EINVAL; } static inline void pcie_disable_tph(struct pci_dev *pdev) { } static inline int pcie_enable_tph(struct pci_dev *pdev, int mode) { return -EINVAL; } -- cgit v1.2.3 From 277b339c4ba5c3d51f86892efd7bfbb012318ed8 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 1 Oct 2024 17:04:10 +0100 Subject: net: pcs: xpcs: move PCS reset to .pcs_pre_config() Move the PCS reset to .pcs_pre_config() rather than at creation time, which means we call the reset function with the interface that we're actually going to be using to talk to the downstream device. Reviewed-by: Vladimir Oltean Tested-by: Vladimir Oltean # sja1105 Signed-off-by: Russell King (Oracle) Tested-by: for them? Link: https://patch.msgid.link/E1svfMA-005ZI3-Va@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/pcs/pcs-xpcs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h index b4a4eb6c8866..fd75d0605bb6 100644 --- a/include/linux/pcs/pcs-xpcs.h +++ b/include/linux/pcs/pcs-xpcs.h @@ -61,6 +61,7 @@ struct dw_xpcs { struct clk_bulk_data clks[DW_XPCS_NUM_CLKS]; struct phylink_pcs pcs; phy_interface_t interface; + bool need_reset; }; int xpcs_get_an_mode(struct dw_xpcs *xpcs, phy_interface_t interface); -- cgit v1.2.3 From bedea1539acb808c870ad85b16f8d8f0bef5e0bb Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 1 Oct 2024 17:04:26 +0100 Subject: net: pcs: xpcs: add xpcs_destroy_pcs() and xpcs_create_pcs_mdiodev() Provide xpcs create/destroy functions that return and take a phylink_pcs pointer instead of an xpcs pointer. This will be used by drivers that have been converted to use phylink_pcs pointers internally, rather than dw_xpcs pointers. As xpcs_create_mdiodev() no longer makes use of its interface argument, pass PHY_INTERFACE_MODE_NA into xpcs_create_mdiodev() until it is removed later in the series. Reviewed-by: Vladimir Oltean Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1svfMQ-005ZIL-Bi@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/pcs/pcs-xpcs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h index fd75d0605bb6..a4e2243ce647 100644 --- a/include/linux/pcs/pcs-xpcs.h +++ b/include/linux/pcs/pcs-xpcs.h @@ -78,4 +78,7 @@ struct dw_xpcs *xpcs_create_fwnode(struct fwnode_handle *fwnode, phy_interface_t interface); void xpcs_destroy(struct dw_xpcs *xpcs); +struct phylink_pcs *xpcs_create_pcs_mdiodev(struct mii_bus *bus, int addr); +void xpcs_destroy_pcs(struct phylink_pcs *pcs); + #endif /* __LINUX_PCS_XPCS_H */ -- cgit v1.2.3 From bf5a61645bb2d51be53da5b26f948045b571147c Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 1 Oct 2024 17:04:51 +0100 Subject: net: pcs: xpcs: drop interface argument from xpcs_create*() The XPCS sub-driver no longer uses the "interface" argument to the xpcs_create_mdiodev() and xpcs_create_fwnode() functions. Remove this now unnecessary argument, updating the stmmac driver appropriately. Reviewed-by: Vladimir Oltean Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1svfMp-005ZIp-UX@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/pcs/pcs-xpcs.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h index a4e2243ce647..758daabb76c7 100644 --- a/include/linux/pcs/pcs-xpcs.h +++ b/include/linux/pcs/pcs-xpcs.h @@ -72,10 +72,8 @@ int xpcs_do_config(struct dw_xpcs *xpcs, phy_interface_t interface, void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces); int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns, int enable); -struct dw_xpcs *xpcs_create_mdiodev(struct mii_bus *bus, int addr, - phy_interface_t interface); -struct dw_xpcs *xpcs_create_fwnode(struct fwnode_handle *fwnode, - phy_interface_t interface); +struct dw_xpcs *xpcs_create_mdiodev(struct mii_bus *bus, int addr); +struct dw_xpcs *xpcs_create_fwnode(struct fwnode_handle *fwnode); void xpcs_destroy(struct dw_xpcs *xpcs); struct phylink_pcs *xpcs_create_pcs_mdiodev(struct mii_bus *bus, int addr); -- cgit v1.2.3 From faefc9730d073e88a490e37b00e3e196b823abcb Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 1 Oct 2024 17:04:57 +0100 Subject: net: pcs: xpcs: make xpcs_do_config() and xpcs_link_up() internal As nothing outside pcs-xpcs.c calls neither xpcs_do_config() nor xpcs_link_up(), remove their exports and prototypes. Reviewed-by: Vladimir Oltean Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1svfMv-005ZIv-2M@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/pcs/pcs-xpcs.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h index 758daabb76c7..abda475111d1 100644 --- a/include/linux/pcs/pcs-xpcs.h +++ b/include/linux/pcs/pcs-xpcs.h @@ -65,10 +65,6 @@ struct dw_xpcs { }; int xpcs_get_an_mode(struct dw_xpcs *xpcs, phy_interface_t interface); -void xpcs_link_up(struct phylink_pcs *pcs, unsigned int neg_mode, - phy_interface_t interface, int speed, int duplex); -int xpcs_do_config(struct dw_xpcs *xpcs, phy_interface_t interface, - const unsigned long *advertising, unsigned int neg_mode); void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces); int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns, int enable); -- cgit v1.2.3 From ced20ea315fe8591093f19574ec32222c1ab71ba Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Tue, 24 Sep 2024 13:48:43 +0530 Subject: soundwire: amd: pass acp pci revision id as resource data Pass ACP pci revision id as resource data and store it in amd SoundWire manager private data structure. This field will be used to differentiate ACP variants. Signed-off-by: Vijendar Mukunda Link: https://lore.kernel.org/r/20240924081846.1834612-2-Vijendar.Mukunda@amd.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_amd.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_amd.h b/include/linux/soundwire/sdw_amd.h index 28a4eb77717f..e0abc59d4748 100644 --- a/include/linux/soundwire/sdw_amd.h +++ b/include/linux/soundwire/sdw_amd.h @@ -30,6 +30,7 @@ struct acp_sdw_pdata { u16 instance; + u32 acp_rev; /* mutex to protect acp common register access */ struct mutex *acp_sdw_lock; }; @@ -66,6 +67,7 @@ struct sdw_amd_dai_runtime { * @instance: SoundWire manager instance * @quirks: SoundWire manager quirks * @wake_en_mask: wake enable mask per SoundWire manager + * @acp_rev: acp pci device revision id * @clk_stopped: flag set to true when clock is stopped * @power_mode_mask: flag interprets amd SoundWire manager power mode * @dai_runtime_array: dai runtime array @@ -94,6 +96,7 @@ struct amd_sdw_manager { u32 quirks; u32 wake_en_mask; u32 power_mode_mask; + u32 acp_rev; bool clk_stopped; struct sdw_amd_dai_runtime **dai_runtime_array; @@ -134,6 +137,7 @@ struct sdw_amd_ctx { * struct sdw_amd_res - Soundwire AMD global resource structure, * typically populated by the DSP driver/Legacy driver * + * @acp_rev: acp pci device revision id * @addr: acp pci device resource start address * @reg_range: ACP register range * @link_mask: bit-wise mask listing links selected by the DSP driver/ @@ -146,6 +150,7 @@ struct sdw_amd_ctx { * @acp_lock: mutex protecting acp common registers access */ struct sdw_amd_res { + u32 acp_rev; u32 addr; u32 reg_range; u32 link_mask; -- cgit v1.2.3 From 7b54323dde29452dd06e6acd2701d9b489c9547d Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Tue, 24 Sep 2024 13:48:44 +0530 Subject: soundwire: amd: refactor existing code for acp 6.3 platform Refactor existing code by adding acp pci revision id coditional checks for ACP 6.3 platform. Rename the macros and structure names with ACP63 tag. Signed-off-by: Vijendar Mukunda Link: https://lore.kernel.org/r/20240924081846.1834612-3-Vijendar.Mukunda@amd.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_amd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_amd.h b/include/linux/soundwire/sdw_amd.h index e0abc59d4748..c9586f22c5a9 100644 --- a/include/linux/soundwire/sdw_amd.h +++ b/include/linux/soundwire/sdw_amd.h @@ -27,6 +27,7 @@ #define ACP_SDW0 0 #define ACP_SDW1 1 #define AMD_SDW_MAX_MANAGER_COUNT 2 +#define ACP63_PCI_REV_ID 0x63 struct acp_sdw_pdata { u16 instance; -- cgit v1.2.3 From 444d6824a4feca142b0a57095a2f1f1bda98e2ab Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 3 Oct 2024 15:06:38 +0800 Subject: soundwire: optimize sdw_stream_runtime memory layout pahole suggestion: swap position of 'm_rt_count' before: pahole -C sdw_stream_runtime drivers/soundwire/soundwire-bus.ko struct sdw_stream_runtime { const char * name; /* 0 8 */ struct sdw_stream_params params; /* 8 12 */ enum sdw_stream_state state; /* 20 4 */ enum sdw_stream_type type; /* 24 4 */ /* XXX 4 bytes hole, try to pack */ struct list_head master_list; /* 32 16 */ int m_rt_count; /* 48 4 */ /* size: 56, cachelines: 1, members: 6 */ /* sum members: 48, holes: 1, sum holes: 4 */ /* padding: 4 */ /* last cacheline: 56 bytes */ }; after: pahole --reorganize -C sdw_stream_runtime drivers/soundwire/soundwire-bus.ko struct sdw_stream_runtime { const char * name; /* 0 8 */ struct sdw_stream_params params; /* 8 12 */ enum sdw_stream_state state; /* 20 4 */ enum sdw_stream_type type; /* 24 4 */ int m_rt_count; /* 28 4 */ struct list_head master_list; /* 32 16 */ /* size: 48, cachelines: 1, members: 6 */ /* last cacheline: 48 bytes */ }; /* saved 8 bytes! */ Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20241003070650.62787-3-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 5e0dd47a0412..a4fa45132030 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -820,15 +820,15 @@ struct sdw_master_port_ops { struct sdw_msg; /** - * struct sdw_defer - SDW deffered message - * @length: message length + * struct sdw_defer - SDW deferred message * @complete: message completion * @msg: SDW message + * @length: message length */ struct sdw_defer { + struct sdw_msg *msg; int length; struct completion complete; - struct sdw_msg *msg; }; /** @@ -1010,18 +1010,18 @@ struct sdw_stream_params { * @params: Stream parameters * @state: Current state of the stream * @type: Stream type PCM or PDM + * @m_rt_count: Count of Master runtime(s) in this stream * @master_list: List of Master runtime(s) in this stream. * master_list can contain only one m_rt per Master instance * for a stream - * @m_rt_count: Count of Master runtime(s) in this stream */ struct sdw_stream_runtime { const char *name; struct sdw_stream_params params; enum sdw_stream_state state; enum sdw_stream_type type; - struct list_head master_list; int m_rt_count; + struct list_head master_list; }; struct sdw_stream_runtime *sdw_alloc_stream(const char *stream_name); -- cgit v1.2.3 From 6cb2c156439430a7f9db2e1f71a7dccf1ca978bf Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 3 Oct 2024 15:06:39 +0800 Subject: soundwire: optimize sdw_master_prop Make pahole happy by moving pointers and u64 first instead of interleaving them. Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20241003070650.62787-4-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index a4fa45132030..2caea7345c3e 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -406,13 +406,14 @@ struct sdw_slave_prop { /** * struct sdw_master_prop - Master properties + * @clk_gears: Clock gears supported + * @clk_freq: Clock frequencies supported, in Hz + * @quirks: bitmask identifying optional behavior beyond the scope of the MIPI specification * @revision: MIPI spec version of the implementation * @clk_stop_modes: Bitmap, bit N set when clock-stop-modeN supported * @max_clk_freq: Maximum Bus clock frequency, in Hz * @num_clk_gears: Number of clock gears supported - * @clk_gears: Clock gears supported * @num_clk_freq: Number of clock frequencies supported, in Hz - * @clk_freq: Clock frequencies supported, in Hz * @default_frame_rate: Controller default Frame rate, in Hz * @default_row: Number of rows * @default_col: Number of columns @@ -421,24 +422,23 @@ struct sdw_slave_prop { * command * @mclk_freq: clock reference passed to SoundWire Master, in Hz. * @hw_disabled: if true, the Master is not functional, typically due to pin-mux - * @quirks: bitmask identifying optional behavior beyond the scope of the MIPI specification */ struct sdw_master_prop { + u32 *clk_gears; + u32 *clk_freq; + u64 quirks; u32 revision; u32 clk_stop_modes; u32 max_clk_freq; u32 num_clk_gears; - u32 *clk_gears; u32 num_clk_freq; - u32 *clk_freq; u32 default_frame_rate; u32 default_row; u32 default_col; - bool dynamic_frame; u32 err_threshold; u32 mclk_freq; + bool dynamic_frame; bool hw_disabled; - u64 quirks; }; /* Definitions for Master quirks */ -- cgit v1.2.3 From 0a323dad1c4e04048988cd04c60eaffd6ae61b1a Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 3 Oct 2024 15:06:40 +0800 Subject: soundwire: optimize sdw_bus structure The sdw_bus structure has seen multiple additions over the years. It's one of the most used structures in this subsystem, so there's merit in reshuffling the members a bit with 'pahole' to reduce holes and structures across cache lines. before: struct sdw_bus { struct device * dev; /* 0 8 */ struct sdw_master_device * md; /* 8 8 */ int controller_id; /* 16 4 */ unsigned int link_id; /* 20 4 */ int id; /* 24 4 */ /* XXX 4 bytes hole, try to pack */ struct list_head slaves; /* 32 16 */ long unsigned int assigned[1]; /* 48 8 */ struct mutex bus_lock; /* 56 160 */ /* --- cacheline 3 boundary (192 bytes) was 24 bytes ago --- */ struct lock_class_key bus_lock_key; /* 216 16 */ struct mutex msg_lock; /* 232 160 */ /* --- cacheline 6 boundary (384 bytes) was 8 bytes ago --- */ struct lock_class_key msg_lock_key; /* 392 16 */ int (*compute_params)(struct sdw_bus *); /* 408 8 */ const struct sdw_master_ops * ops; /* 416 8 */ const struct sdw_master_port_ops * port_ops; /* 424 8 */ struct sdw_bus_params params; /* 432 36 */ /* XXX 4 bytes hole, try to pack */ /* --- cacheline 7 boundary (448 bytes) was 24 bytes ago --- */ struct sdw_master_prop prop; /* 472 72 */ /* XXX last struct has 6 bytes of padding */ /* --- cacheline 8 boundary (512 bytes) was 32 bytes ago --- */ void * vendor_specific_prop; /* 544 8 */ struct list_head m_rt_list; /* 552 16 */ struct dentry * debugfs; /* 568 8 */ /* --- cacheline 9 boundary (576 bytes) --- */ struct irq_chip irq_chip; /* 576 264 */ /* --- cacheline 13 boundary (832 bytes) was 8 bytes ago --- */ struct irq_domain * domain; /* 840 8 */ struct sdw_defer defer_msg; /* 848 112 */ /* --- cacheline 15 boundary (960 bytes) --- */ unsigned int clk_stop_timeout; /* 960 4 */ u32 bank_switch_timeout; /* 964 4 */ bool multi_link; /* 968 1 */ /* XXX 3 bytes hole, try to pack */ int hw_sync_min_links; /* 972 4 */ int stream_refcount; /* 976 4 */ /* size: 984, cachelines: 16, members: 27 */ /* sum members: 969, holes: 3, sum holes: 11 */ /* padding: 4 */ /* paddings: 1, sum paddings: 6 */ /* last cacheline: 24 bytes */ }; after: struct sdw_bus { struct device * dev; /* 0 8 */ struct sdw_master_device * md; /* 8 8 */ struct lock_class_key bus_lock_key; /* 16 16 */ struct mutex bus_lock; /* 32 160 */ /* --- cacheline 3 boundary (192 bytes) --- */ struct list_head slaves; /* 192 16 */ struct lock_class_key msg_lock_key; /* 208 16 */ struct mutex msg_lock; /* 224 160 */ /* --- cacheline 6 boundary (384 bytes) --- */ struct list_head m_rt_list; /* 384 16 */ struct sdw_defer defer_msg; /* 400 112 */ /* --- cacheline 8 boundary (512 bytes) --- */ struct sdw_bus_params params; /* 512 36 */ int stream_refcount; /* 548 4 */ const struct sdw_master_ops * ops; /* 552 8 */ const struct sdw_master_port_ops * port_ops; /* 560 8 */ struct sdw_master_prop prop; /* 568 72 */ /* XXX last struct has 6 bytes of padding */ /* --- cacheline 10 boundary (640 bytes) --- */ void * vendor_specific_prop; /* 640 8 */ int hw_sync_min_links; /* 648 4 */ int controller_id; /* 652 4 */ unsigned int link_id; /* 656 4 */ int id; /* 660 4 */ int (*compute_params)(struct sdw_bus *); /* 664 8 */ long unsigned int assigned[1]; /* 672 8 */ unsigned int clk_stop_timeout; /* 680 4 */ u32 bank_switch_timeout; /* 684 4 */ struct irq_chip irq_chip; /* 688 264 */ /* --- cacheline 14 boundary (896 bytes) was 56 bytes ago --- */ struct irq_domain * domain; /* 952 8 */ /* --- cacheline 15 boundary (960 bytes) --- */ struct dentry * debugfs; /* 960 8 */ bool multi_link; /* 968 1 */ /* size: 976, cachelines: 16, members: 27 */ /* padding: 7 */ /* paddings: 1, sum paddings: 6 */ /* last cacheline: 16 bytes */ }; Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20241003070650.62787-5-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 77 ++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 2caea7345c3e..6fcf122c1831 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -871,68 +871,71 @@ struct sdw_master_ops { * struct sdw_bus - SoundWire bus * @dev: Shortcut to &bus->md->dev to avoid changing the entire code. * @md: Master device - * @controller_id: system-unique controller ID. If set to -1, the bus @id will be used. - * @link_id: Link id number, can be 0 to N, unique for each Controller - * @id: bus system-wide unique id - * @slaves: list of Slaves on this bus - * @assigned: Bitmap for Slave device numbers. - * Bit set implies used number, bit clear implies unused number. + * @bus_lock_key: bus lock key associated to @bus_lock * @bus_lock: bus lock + * @slaves: list of Slaves on this bus + * @msg_lock_key: message lock key associated to @msg_lock * @msg_lock: message lock - * @compute_params: points to Bus resource management implementation - * @ops: Master callback ops - * @port_ops: Master port callback ops - * @params: Current bus parameters - * @prop: Master properties - * @vendor_specific_prop: pointer to non-standard properties * @m_rt_list: List of Master instance of all stream(s) running on Bus. This * is used to compute and program bus bandwidth, clock, frame shape, * transport and port parameters - * @debugfs: Bus debugfs - * @domain: IRQ domain * @defer_msg: Defer message - * @clk_stop_timeout: Clock stop timeout computed - * @bank_switch_timeout: Bank switch timeout computed - * @multi_link: Store bus property that indicates if multi links - * are supported. This flag is populated by drivers after reading - * appropriate firmware (ACPI/DT). + * @params: Current bus parameters + * @stream_refcount: number of streams currently using this bus + * @ops: Master callback ops + * @port_ops: Master port callback ops + * @prop: Master properties + * @vendor_specific_prop: pointer to non-standard properties * @hw_sync_min_links: Number of links used by a stream above which * hardware-based synchronization is required. This value is only * meaningful if multi_link is set. If set to 1, hardware-based * synchronization will be used even if a stream only uses a single * SoundWire segment. - * @stream_refcount: number of streams currently using this bus + * @controller_id: system-unique controller ID. If set to -1, the bus @id will be used. + * @link_id: Link id number, can be 0 to N, unique for each Controller + * @id: bus system-wide unique id + * @compute_params: points to Bus resource management implementation + * @assigned: Bitmap for Slave device numbers. + * Bit set implies used number, bit clear implies unused number. + * @clk_stop_timeout: Clock stop timeout computed + * @bank_switch_timeout: Bank switch timeout computed + * @domain: IRQ domain + * @irq_chip: IRQ chip + * @debugfs: Bus debugfs (optional) + * @multi_link: Store bus property that indicates if multi links + * are supported. This flag is populated by drivers after reading + * appropriate firmware (ACPI/DT). */ struct sdw_bus { struct device *dev; struct sdw_master_device *md; - int controller_id; - unsigned int link_id; - int id; - struct list_head slaves; - DECLARE_BITMAP(assigned, SDW_MAX_DEVICES); - struct mutex bus_lock; struct lock_class_key bus_lock_key; - struct mutex msg_lock; + struct mutex bus_lock; + struct list_head slaves; struct lock_class_key msg_lock_key; - int (*compute_params)(struct sdw_bus *bus); + struct mutex msg_lock; + struct list_head m_rt_list; + struct sdw_defer defer_msg; + struct sdw_bus_params params; + int stream_refcount; const struct sdw_master_ops *ops; const struct sdw_master_port_ops *port_ops; - struct sdw_bus_params params; struct sdw_master_prop prop; void *vendor_specific_prop; - struct list_head m_rt_list; + int hw_sync_min_links; + int controller_id; + unsigned int link_id; + int id; + int (*compute_params)(struct sdw_bus *bus); + DECLARE_BITMAP(assigned, SDW_MAX_DEVICES); + unsigned int clk_stop_timeout; + u32 bank_switch_timeout; + struct irq_chip irq_chip; + struct irq_domain *domain; #ifdef CONFIG_DEBUG_FS struct dentry *debugfs; #endif - struct irq_chip irq_chip; - struct irq_domain *domain; - struct sdw_defer defer_msg; - unsigned int clk_stop_timeout; - u32 bank_switch_timeout; bool multi_link; - int hw_sync_min_links; - int stream_refcount; }; int sdw_bus_master_add(struct sdw_bus *bus, struct device *parent, -- cgit v1.2.3 From 1c758df5a83ea0c9b5055536336d8a586b5010b0 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 3 Oct 2024 15:06:41 +0800 Subject: soundwire: optimize sdw_slave_prop move pointers first, and move booleans together. before: struct sdw_slave_prop { u32 mipi_revision; /* 0 4 */ bool wake_capable; /* 4 1 */ bool test_mode_capable; /* 5 1 */ bool clk_stop_mode1; /* 6 1 */ bool simple_clk_stop_capable; /* 7 1 */ u32 clk_stop_timeout; /* 8 4 */ u32 ch_prep_timeout; /* 12 4 */ enum sdw_clk_stop_reset_behave reset_behave; /* 16 4 */ bool high_PHY_capable; /* 20 1 */ bool paging_support; /* 21 1 */ bool bank_delay_support; /* 22 1 */ /* XXX 1 byte hole, try to pack */ enum sdw_p15_behave p15_behave; /* 24 4 */ bool lane_control_support; /* 28 1 */ /* XXX 3 bytes hole, try to pack */ u32 master_count; /* 32 4 */ u32 source_ports; /* 36 4 */ u32 sink_ports; /* 40 4 */ /* XXX 4 bytes hole, try to pack */ struct sdw_dp0_prop * dp0_prop; /* 48 8 */ struct sdw_dpn_prop * src_dpn_prop; /* 56 8 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct sdw_dpn_prop * sink_dpn_prop; /* 64 8 */ u8 scp_int1_mask; /* 72 1 */ /* XXX 3 bytes hole, try to pack */ u32 quirks; /* 76 4 */ bool clock_reg_supported; /* 80 1 */ bool use_domain_irq; /* 81 1 */ /* size: 88, cachelines: 2, members: 23 */ /* sum members: 71, holes: 4, sum holes: 11 */ /* padding: 6 */ /* last cacheline: 24 bytes */ }; after: truct sdw_slave_prop { struct sdw_dp0_prop * dp0_prop; /* 0 8 */ struct sdw_dpn_prop * src_dpn_prop; /* 8 8 */ struct sdw_dpn_prop * sink_dpn_prop; /* 16 8 */ u32 mipi_revision; /* 24 4 */ bool wake_capable; /* 28 1 */ bool test_mode_capable; /* 29 1 */ bool clk_stop_mode1; /* 30 1 */ bool simple_clk_stop_capable; /* 31 1 */ u32 clk_stop_timeout; /* 32 4 */ u32 ch_prep_timeout; /* 36 4 */ enum sdw_clk_stop_reset_behave reset_behave; /* 40 4 */ bool high_PHY_capable; /* 44 1 */ bool paging_support; /* 45 1 */ bool bank_delay_support; /* 46 1 */ bool lane_control_support; /* 47 1 */ enum sdw_p15_behave p15_behave; /* 48 4 */ u32 master_count; /* 52 4 */ u32 source_ports; /* 56 4 */ u32 sink_ports; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ u32 quirks; /* 64 4 */ u8 scp_int1_mask; /* 68 1 */ bool clock_reg_supported; /* 69 1 */ bool use_domain_irq; /* 70 1 */ /* size: 72, cachelines: 2, members: 23 */ /* padding: 1 */ /* last cacheline: 8 bytes */ }; Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20241003070650.62787-6-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 6fcf122c1831..38db81f5bdb9 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -344,6 +344,9 @@ struct sdw_dpn_prop { /** * struct sdw_slave_prop - SoundWire Slave properties + * @dp0_prop: Data Port 0 properties + * @src_dpn_prop: Source Data Port N properties + * @sink_dpn_prop: Sink Data Port N properties * @mipi_revision: Spec version of the implementation * @wake_capable: Wake-up events are supported * @test_mode_capable: If test mode is supported @@ -360,15 +363,12 @@ struct sdw_dpn_prop { * SCP_AddrPage2 * @bank_delay_support: Slave implements bank delay/bridge support registers * SCP_BankDelay and SCP_NextFrame + * @lane_control_support: Slave supports lane control * @p15_behave: Slave behavior when the Master attempts a read to the Port15 * alias - * @lane_control_support: Slave supports lane control * @master_count: Number of Masters present on this Slave * @source_ports: Bitmap identifying source ports * @sink_ports: Bitmap identifying sink ports - * @dp0_prop: Data Port 0 properties - * @src_dpn_prop: Source Data Port N properties - * @sink_dpn_prop: Sink Data Port N properties * @scp_int1_mask: SCP_INT1_MASK desired settings * @quirks: bitmask identifying deltas from the MIPI specification * @clock_reg_supported: the Peripheral implements the clock base and scale @@ -377,6 +377,9 @@ struct sdw_dpn_prop { * @use_domain_irq: call actual IRQ handler on slave, as well as callback */ struct sdw_slave_prop { + struct sdw_dp0_prop *dp0_prop; + struct sdw_dpn_prop *src_dpn_prop; + struct sdw_dpn_prop *sink_dpn_prop; u32 mipi_revision; bool wake_capable; bool test_mode_capable; @@ -388,16 +391,13 @@ struct sdw_slave_prop { bool high_PHY_capable; bool paging_support; bool bank_delay_support; - enum sdw_p15_behave p15_behave; bool lane_control_support; + enum sdw_p15_behave p15_behave; u32 master_count; u32 source_ports; u32 sink_ports; - struct sdw_dp0_prop *dp0_prop; - struct sdw_dpn_prop *src_dpn_prop; - struct sdw_dpn_prop *sink_dpn_prop; - u8 scp_int1_mask; u32 quirks; + u8 scp_int1_mask; bool clock_reg_supported; bool use_domain_irq; }; -- cgit v1.2.3 From 557e28f8b53243097162cf4d3e59bcee9fb9713b Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 3 Oct 2024 15:06:42 +0800 Subject: soundwire: optimize sdw_dp0_prop Move pointers and booleans. Before: struct sdw_dp0_prop { u32 max_word; /* 0 4 */ u32 min_word; /* 4 4 */ u32 num_words; /* 8 4 */ /* XXX 4 bytes hole, try to pack */ u32 * words; /* 16 8 */ bool BRA_flow_controlled; /* 24 1 */ bool simple_ch_prep_sm; /* 25 1 */ /* XXX 2 bytes hole, try to pack */ u32 ch_prep_timeout; /* 28 4 */ bool imp_def_interrupts; /* 32 1 */ /* size: 40, cachelines: 1, members: 8 */ /* sum members: 27, holes: 2, sum holes: 6 */ /* padding: 7 */ /* last cacheline: 40 bytes */ }; after: struct sdw_dp0_prop { u32 * words; /* 0 8 */ u32 max_word; /* 8 4 */ u32 min_word; /* 12 4 */ u32 num_words; /* 16 4 */ u32 ch_prep_timeout; /* 20 4 */ bool BRA_flow_controlled; /* 24 1 */ bool simple_ch_prep_sm; /* 25 1 */ bool imp_def_interrupts; /* 26 1 */ /* size: 32, cachelines: 1, members: 8 */ /* padding: 5 */ /* last cacheline: 32 bytes */ }; Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20241003070650.62787-7-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 38db81f5bdb9..c72095137a35 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -226,16 +226,16 @@ enum sdw_clk_stop_mode { /** * struct sdw_dp0_prop - DP0 properties + * @words: wordlengths supported * @max_word: Maximum number of bits in a Payload Channel Sample, 1 to 64 * (inclusive) * @min_word: Minimum number of bits in a Payload Channel Sample, 1 to 64 * (inclusive) * @num_words: number of wordlengths supported - * @words: wordlengths supported + * @ch_prep_timeout: Port-specific timeout value, in milliseconds * @BRA_flow_controlled: Slave implementation results in an OK_NotReady * response * @simple_ch_prep_sm: If channel prepare sequence is required - * @ch_prep_timeout: Port-specific timeout value, in milliseconds * @imp_def_interrupts: If set, each bit corresponds to support for * implementation-defined interrupts * @@ -244,13 +244,13 @@ enum sdw_clk_stop_mode { * support */ struct sdw_dp0_prop { + u32 *words; u32 max_word; u32 min_word; u32 num_words; - u32 *words; + u32 ch_prep_timeout; bool BRA_flow_controlled; bool simple_ch_prep_sm; - u32 ch_prep_timeout; bool imp_def_interrupts; }; -- cgit v1.2.3 From 9942f90bdcc035eb5f01d7343dac99bd805ef3ec Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 3 Oct 2024 15:06:43 +0800 Subject: soundwire: optimize sdw_dpn_prop before: struct sdw_dpn_prop { u32 num; /* 0 4 */ u32 max_word; /* 4 4 */ u32 min_word; /* 8 4 */ u32 num_words; /* 12 4 */ u32 * words; /* 16 8 */ enum sdw_dpn_type type; /* 24 4 */ u32 max_grouping; /* 28 4 */ bool simple_ch_prep_sm; /* 32 1 */ /* XXX 3 bytes hole, try to pack */ u32 ch_prep_timeout; /* 36 4 */ u32 imp_def_interrupts; /* 40 4 */ u32 max_ch; /* 44 4 */ u32 min_ch; /* 48 4 */ u32 num_channels; /* 52 4 */ u32 * channels; /* 56 8 */ /* --- cacheline 1 boundary (64 bytes) --- */ u32 num_ch_combinations; /* 64 4 */ /* XXX 4 bytes hole, try to pack */ u32 * ch_combinations; /* 72 8 */ u32 modes; /* 80 4 */ u32 max_async_buffer; /* 84 4 */ bool block_pack_mode; /* 88 1 */ bool read_only_wordlength; /* 89 1 */ /* XXX 2 bytes hole, try to pack */ u32 port_encoding; /* 92 4 */ struct sdw_dpn_audio_mode * audio_modes; /* 96 8 */ /* size: 104, cachelines: 2, members: 22 */ /* sum members: 95, holes: 3, sum holes: 9 */ /* last cacheline: 40 bytes */ }; after: struct sdw_dpn_prop { struct sdw_dpn_audio_mode * audio_modes; /* 0 8 */ u32 num; /* 8 4 */ u32 max_word; /* 12 4 */ u32 min_word; /* 16 4 */ u32 num_words; /* 20 4 */ u32 * words; /* 24 8 */ enum sdw_dpn_type type; /* 32 4 */ u32 max_grouping; /* 36 4 */ u32 ch_prep_timeout; /* 40 4 */ u32 imp_def_interrupts; /* 44 4 */ u32 max_ch; /* 48 4 */ u32 min_ch; /* 52 4 */ u32 num_channels; /* 56 4 */ u32 num_ch_combinations; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ u32 * channels; /* 64 8 */ u32 * ch_combinations; /* 72 8 */ u32 modes; /* 80 4 */ u32 max_async_buffer; /* 84 4 */ u32 port_encoding; /* 88 4 */ bool block_pack_mode; /* 92 1 */ bool read_only_wordlength; /* 93 1 */ bool simple_ch_prep_sm; /* 94 1 */ /* size: 96, cachelines: 2, members: 22 */ /* padding: 1 */ /* last cacheline: 32 bytes */ }; Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20241003070650.62787-8-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index c72095137a35..cc0afb8af333 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -288,6 +288,7 @@ struct sdw_dpn_audio_mode { /** * struct sdw_dpn_prop - Data Port DPn properties + * @audio_modes: Audio modes supported * @num: port number * @max_word: Maximum number of bits in a Payload Channel Sample, 1 to 64 * (inclusive) @@ -298,26 +299,26 @@ struct sdw_dpn_audio_mode { * @type: Data port type. Full, Simplified or Reduced * @max_grouping: Maximum number of samples that can be grouped together for * a full data port - * @simple_ch_prep_sm: If the port supports simplified channel prepare state - * machine * @ch_prep_timeout: Port-specific timeout value, in milliseconds * @imp_def_interrupts: If set, each bit corresponds to support for * implementation-defined interrupts * @max_ch: Maximum channels supported * @min_ch: Minimum channels supported * @num_channels: Number of discrete channels supported - * @channels: Discrete channels supported * @num_ch_combinations: Number of channel combinations supported + * @channels: Discrete channels supported * @ch_combinations: Channel combinations supported * @modes: SDW mode supported * @max_async_buffer: Number of samples that this port can buffer in * asynchronous modes + * @port_encoding: Payload Channel Sample encoding schemes supported * @block_pack_mode: Type of block port mode supported * @read_only_wordlength: Read Only wordlength field in DPN_BlockCtrl1 register - * @port_encoding: Payload Channel Sample encoding schemes supported - * @audio_modes: Audio modes supported + * @simple_ch_prep_sm: If the port supports simplified channel prepare state + * machine */ struct sdw_dpn_prop { + struct sdw_dpn_audio_mode *audio_modes; u32 num; u32 max_word; u32 min_word; @@ -325,21 +326,20 @@ struct sdw_dpn_prop { u32 *words; enum sdw_dpn_type type; u32 max_grouping; - bool simple_ch_prep_sm; u32 ch_prep_timeout; u32 imp_def_interrupts; u32 max_ch; u32 min_ch; u32 num_channels; - u32 *channels; u32 num_ch_combinations; + u32 *channels; u32 *ch_combinations; u32 modes; u32 max_async_buffer; + u32 port_encoding; bool block_pack_mode; bool read_only_wordlength; - u32 port_encoding; - struct sdw_dpn_audio_mode *audio_modes; + bool simple_ch_prep_sm; }; /** -- cgit v1.2.3 From 1ae4aa59d79399be0591c8d78c44e280406e2c34 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 3 Oct 2024 15:06:44 +0800 Subject: soundwire: mipi-disco: remove DPn audio-modes The concept of DPn audio-modes was never used by anyone, and was removed from the DisCo for SoundWire 2.0 specification. Remove the definitions and TODO. Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20241003070650.62787-9-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 34 ---------------------------------- 1 file changed, 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index cc0afb8af333..66feaa79ecfc 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -254,41 +254,8 @@ struct sdw_dp0_prop { bool imp_def_interrupts; }; -/** - * struct sdw_dpn_audio_mode - Audio mode properties for DPn - * @bus_min_freq: Minimum bus frequency, in Hz - * @bus_max_freq: Maximum bus frequency, in Hz - * @bus_num_freq: Number of discrete frequencies supported - * @bus_freq: Discrete bus frequencies, in Hz - * @min_freq: Minimum sampling frequency, in Hz - * @max_freq: Maximum sampling bus frequency, in Hz - * @num_freq: Number of discrete sampling frequency supported - * @freq: Discrete sampling frequencies, in Hz - * @prep_ch_behave: Specifies the dependencies between Channel Prepare - * sequence and bus clock configuration - * If 0, Channel Prepare can happen at any Bus clock rate - * If 1, Channel Prepare sequence shall happen only after Bus clock is - * changed to a frequency supported by this mode or compatible modes - * described by the next field - * @glitchless: Bitmap describing possible glitchless transitions from this - * Audio Mode to other Audio Modes - */ -struct sdw_dpn_audio_mode { - u32 bus_min_freq; - u32 bus_max_freq; - u32 bus_num_freq; - u32 *bus_freq; - u32 max_freq; - u32 min_freq; - u32 num_freq; - u32 *freq; - u32 prep_ch_behave; - u32 glitchless; -}; - /** * struct sdw_dpn_prop - Data Port DPn properties - * @audio_modes: Audio modes supported * @num: port number * @max_word: Maximum number of bits in a Payload Channel Sample, 1 to 64 * (inclusive) @@ -318,7 +285,6 @@ struct sdw_dpn_audio_mode { * machine */ struct sdw_dpn_prop { - struct sdw_dpn_audio_mode *audio_modes; u32 num; u32 max_word; u32 min_word; -- cgit v1.2.3 From 543bd28a3bfeff31f748ba83348b63313dd37ff9 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 3 Oct 2024 15:06:49 +0800 Subject: soundwire: mipi-disco: add new properties from 2.0 spec The DisCo for SoundWire 2.0 spec adds support for new 'mipi-sdw-sdca-interrupt-register-list' and 'mipi-sdw-commit-register-supported'. This patch only adds the definitions and property reads, but the use of these properties will come at some point in the future when needed. Note a slight conceptual disconnect between the MIPI DisCo definition of a boolean property and the Linux implementation. The latter only checks the presence of the property to set its value to 'true', whereas the MIPI definitions allow for a property with a 'false' value. This patch uses the new introduced mipi_device_property_read_bool() to handle it. Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20241003070650.62787-14-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 66feaa79ecfc..952514f044f0 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -335,8 +335,11 @@ struct sdw_dpn_prop { * @master_count: Number of Masters present on this Slave * @source_ports: Bitmap identifying source ports * @sink_ports: Bitmap identifying sink ports - * @scp_int1_mask: SCP_INT1_MASK desired settings * @quirks: bitmask identifying deltas from the MIPI specification + * @sdca_interrupt_register_list: indicates which sets of SDCA interrupt status + * and masks are supported + * @commit_register_supported: is PCP_Commit register supported + * @scp_int1_mask: SCP_INT1_MASK desired settings * @clock_reg_supported: the Peripheral implements the clock base and scale * registers introduced with the SoundWire 1.2 specification. SDCA devices * do not need to set this boolean property as the registers are required. @@ -363,6 +366,8 @@ struct sdw_slave_prop { u32 source_ports; u32 sink_ports; u32 quirks; + u32 sdca_interrupt_register_list; + u8 commit_register_supported; u8 scp_int1_mask; bool clock_reg_supported; bool use_domain_irq; -- cgit v1.2.3 From 71b405b184449fffcb76ea0814104b71dfdb2aee Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 3 Oct 2024 15:06:50 +0800 Subject: soundwire: mipi-disco: add support for DP0/DPn 'lane-list' property The SoundWire specification did not clearly require that ports could use all Lanes. Some SoundWire/SDCA peripheral adopters added restrictions on which lanes can be used by what port, and the DisCo for SoundWire 2.1 specification added a 'lane-list' property to model this hardware limitation. When not specified, the ports can use all Lanes. Otherwise, the 'lane-list' indicates which Lanes can be used, sorted by order of preference (most-preferred-first). This patch only reads the properties, the use of this property will come at a later time with multi-lane support. Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20241003070650.62787-15-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 952514f044f0..73f655334fe9 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -238,6 +238,8 @@ enum sdw_clk_stop_mode { * @simple_ch_prep_sm: If channel prepare sequence is required * @imp_def_interrupts: If set, each bit corresponds to support for * implementation-defined interrupts + * @num_lanes: array size of @lane_list + * @lane_list: indicates which Lanes can be used by DP0 * * The wordlengths are specified by Spec as max, min AND number of * discrete values, implementation can define based on the wordlengths they @@ -252,6 +254,8 @@ struct sdw_dp0_prop { bool BRA_flow_controlled; bool simple_ch_prep_sm; bool imp_def_interrupts; + int num_lanes; + u32 *lane_list; }; /** @@ -275,6 +279,8 @@ struct sdw_dp0_prop { * @num_ch_combinations: Number of channel combinations supported * @channels: Discrete channels supported * @ch_combinations: Channel combinations supported + * @lane_list: indicates which Lanes can be used by DPn + * @num_lanes: array size of @lane_list * @modes: SDW mode supported * @max_async_buffer: Number of samples that this port can buffer in * asynchronous modes @@ -300,6 +306,8 @@ struct sdw_dpn_prop { u32 num_ch_combinations; u32 *channels; u32 *ch_combinations; + u32 *lane_list; + int num_lanes; u32 modes; u32 max_async_buffer; u32 port_encoding; -- cgit v1.2.3 From 5b3fdc9f2ff10d3cee106ddaa0ee6636c7de381e Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 30 Sep 2024 14:33:29 +0200 Subject: random: Do not include in Files that use prandom infrastructure are now converted to use header instead of . Remove the legacy inclusion of from . This is the "nice cleanup" part, wished in c0842fbc1b18. Signed-off-by: Uros Bizjak Fixes: c0842fbc1b18 ("random32: move the pseudo-random 32-bit definitions to prandom.h") Cc: Theodore Ts'o Cc: Jason A. Donenfeld Cc: Linus Torvalds Signed-off-by: Jason A. Donenfeld --- include/linux/random.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index b0a940af4fff..333cecfca93f 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -145,13 +145,6 @@ declare_get_random_var_wait(u64, u32) declare_get_random_var_wait(long, unsigned long) #undef declare_get_random_var -/* - * This is designed to be standalone for just prandom - * users, but for now we include it from - * for legacy reasons. - */ -#include - #ifdef CONFIG_SMP int random_prepare_cpu(unsigned int cpu); int random_online_cpu(unsigned int cpu); -- cgit v1.2.3 From d18c13697b4dcbf6a8f06c3d8e564c4f5ad1477c Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 30 Sep 2024 14:33:30 +0200 Subject: prandom: Include in include was removed from in d9f29deb7fe8 ("prandom: Remove unused include") because this inclusion broke arm64 due to a circular dependency on include files. __percpu tag is defined in include/linux/compiler_types.h, so there is currently no direct need for the inclusion of . However, in [1] we would like to repurpose __percpu tag as a named address space qualifier, where __percpu macro uses defines from . The circular dependency was removed in ddd8e37ebaa1 ("random: Do not include in ") and it cleared the path for the inclusion of in . This patch is basically a revert of d9f29deb7fe8 ("prandom: Remove unused include"). [1] https://lore.kernel.org/lkml/20240812115945.484051-4-ubizjak@gmail.com/ Signed-off-by: Uros Bizjak Cc: Theodore Ts'o Cc: Jason A. Donenfeld Cc: Kent Overstreet Signed-off-by: Jason A. Donenfeld --- include/linux/prandom.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/prandom.h b/include/linux/prandom.h index f7f1e5251c67..f2ed5b72b3d6 100644 --- a/include/linux/prandom.h +++ b/include/linux/prandom.h @@ -10,6 +10,7 @@ #include #include +#include #include struct rnd_state { -- cgit v1.2.3 From 816ad8f1e498fe5e9e992da137316302219f2137 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 2 Oct 2024 14:51:52 -0700 Subject: lib: packing: remove kernel-doc from header file It is not necessary to have the kernel-doc duplicated both in the header and in the implementation. It is better to have it near the implementation of the function, since in C, a function can have N declarations, but only one definition. Signed-off-by: Vladimir Oltean Signed-off-by: Jacob Keller Reviewed-by: Przemek Kitszel Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241002-packing-kunit-tests-and-split-pack-unpack-v2-3-8373e551eae3@intel.com Signed-off-by: Jakub Kicinski --- include/linux/packing.h | 26 -------------------------- 1 file changed, 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/packing.h b/include/linux/packing.h index 8d6571feb95d..69baefebcd02 100644 --- a/include/linux/packing.h +++ b/include/linux/packing.h @@ -17,32 +17,6 @@ enum packing_op { UNPACK, }; -/** - * packing - Convert numbers (currently u64) between a packed and an unpacked - * format. Unpacked means laid out in memory in the CPU's native - * understanding of integers, while packed means anything else that - * requires translation. - * - * @pbuf: Pointer to a buffer holding the packed value. - * @uval: Pointer to an u64 holding the unpacked value. - * @startbit: The index (in logical notation, compensated for quirks) where - * the packed value starts within pbuf. Must be larger than, or - * equal to, endbit. - * @endbit: The index (in logical notation, compensated for quirks) where - * the packed value ends within pbuf. Must be smaller than, or equal - * to, startbit. - * @op: If PACK, then uval will be treated as const pointer and copied (packed) - * into pbuf, between startbit and endbit. - * If UNPACK, then pbuf will be treated as const pointer and the logical - * value between startbit and endbit will be copied (unpacked) to uval. - * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and - * QUIRK_MSB_ON_THE_RIGHT. - * - * Return: 0 on success, EINVAL or ERANGE if called incorrectly. Assuming - * correct usage, return code may be discarded. - * If op is PACK, pbuf is modified. - * If op is UNPACK, uval is modified. - */ int packing(void *pbuf, u64 *uval, int startbit, int endbit, size_t pbuflen, enum packing_op op, u8 quirks); -- cgit v1.2.3 From 7263f64e16d90ded44ce211381b2def83db32fd9 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 2 Oct 2024 14:51:53 -0700 Subject: lib: packing: add pack() and unpack() wrappers over packing() Geert Uytterhoeven described packing() as "really bad API" because of not being able to enforce const correctness. The same function is used both when "pbuf" is input and "uval" is output, as in the other way around. Create 2 wrapper functions where const correctness can be ensured. Do ugly type casts inside, to be able to reuse packing() as currently implemented - which will _not_ modify the input argument. Also, take the opportunity to change the type of startbit and endbit to size_t - an unsigned type - in these new function prototypes. When int, an extra check for negative values is necessary. Hopefully, when packing() goes away completely, that check can be dropped. My concern is that code which does rely on the conditional directionality of packing() is harder to refactor without blowing up in size. So it may take a while to completely eliminate packing(). But let's make alternatives available for those who do not need that. Link: https://lore.kernel.org/netdev/20210223112003.2223332-1-geert+renesas@glider.be/ Signed-off-by: Vladimir Oltean Signed-off-by: Jacob Keller Reviewed-by: Przemek Kitszel Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241002-packing-kunit-tests-and-split-pack-unpack-v2-4-8373e551eae3@intel.com Signed-off-by: Jakub Kicinski --- include/linux/packing.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/packing.h b/include/linux/packing.h index 69baefebcd02..ea25cb93cc70 100644 --- a/include/linux/packing.h +++ b/include/linux/packing.h @@ -20,4 +20,10 @@ enum packing_op { int packing(void *pbuf, u64 *uval, int startbit, int endbit, size_t pbuflen, enum packing_op op, u8 quirks); +int pack(void *pbuf, const u64 *uval, size_t startbit, size_t endbit, + size_t pbuflen, u8 quirks); + +int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, + size_t pbuflen, u8 quirks); + #endif -- cgit v1.2.3 From 28aec9ca29f05dabb835293acc6e202bf51b8092 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 2 Oct 2024 14:51:54 -0700 Subject: lib: packing: duplicate pack() and unpack() implementations packing() is now used in some hot paths, and it would be good to get rid of some ifs and buts that depend on "op", to speed things up a little bit. With the main implementations now taking size_t endbit, we no longer have to check for negative values. Update the local integer variables to also be size_t to match. Signed-off-by: Vladimir Oltean Signed-off-by: Jacob Keller Reviewed-by: Przemek Kitszel Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241002-packing-kunit-tests-and-split-pack-unpack-v2-5-8373e551eae3@intel.com Signed-off-by: Jakub Kicinski --- include/linux/packing.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/packing.h b/include/linux/packing.h index ea25cb93cc70..5d36dcd06f60 100644 --- a/include/linux/packing.h +++ b/include/linux/packing.h @@ -20,8 +20,8 @@ enum packing_op { int packing(void *pbuf, u64 *uval, int startbit, int endbit, size_t pbuflen, enum packing_op op, u8 quirks); -int pack(void *pbuf, const u64 *uval, size_t startbit, size_t endbit, - size_t pbuflen, u8 quirks); +int pack(void *pbuf, u64 uval, size_t startbit, size_t endbit, size_t pbuflen, + u8 quirks); int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, size_t pbuflen, u8 quirks); -- cgit v1.2.3 From da7d71bcb0637b7aa18934628fdb5a55f2db49a6 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 16 Sep 2024 02:17:11 -0700 Subject: bpf: Use KF_FASTCALL to mark kfuncs supporting fastcall contract In order to allow pahole add btf_decl_tag("bpf_fastcall") for kfuncs supporting bpf_fastcall, mark such functions with KF_FASTCALL in id_set8 objects. Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240916091712.2929279-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index b8a583194c4a..631060e3ad14 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -75,6 +75,7 @@ #define KF_ITER_NEXT (1 << 9) /* kfunc implements BPF iter next method */ #define KF_ITER_DESTROY (1 << 10) /* kfunc implements BPF iter destructor */ #define KF_RCU_PROTECTED (1 << 11) /* kfunc should be protected by rcu cs when they are invoked */ +#define KF_FASTCALL (1 << 12) /* kfunc supports bpf_fastcall protocol */ /* * Tag marking a kernel function as a kfunc. This is meant to minimize the -- cgit v1.2.3 From a04d5f82fa3893aa06386db93883b151b93b11ed Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Oct 2024 05:01:05 +0100 Subject: mm: Remove PageMappedToDisk All callers have now been converted to the folio APIs, so remove the page API for this flag. Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20241002040111.1023018-4-willy@infradead.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/page-flags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 1b3a76710487..35d08c30d4a6 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -554,7 +554,7 @@ FOLIO_FLAG(owner_2, FOLIO_HEAD_PAGE) */ TESTPAGEFLAG(Writeback, writeback, PF_NO_TAIL) TESTSCFLAG(Writeback, writeback, PF_NO_TAIL) -PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL) +FOLIO_FLAG(mappedtodisk, FOLIO_HEAD_PAGE) /* PG_readahead is only used for reads; PG_reclaim is only for writes */ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL) -- cgit v1.2.3 From fd15ba4cb00a43fb4df42e1f95f94857ad122eea Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Oct 2024 05:01:07 +0100 Subject: ceph: Remove call to PagePrivate2() Use the folio that we already have to call folio_test_private_2() instead. This is the last call to PagePrivate2(), so replace its PAGEFLAG() definition with FOLIO_FLAG(). Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20241002040111.1023018-6-willy@infradead.org Signed-off-by: Christian Brauner --- include/linux/page-flags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 35d08c30d4a6..4c2dfe289046 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -543,7 +543,7 @@ FOLIO_FLAG(swapbacked, FOLIO_HEAD_PAGE) * - PG_private and PG_private_2 cause release_folio() and co to be invoked */ PAGEFLAG(Private, private, PF_ANY) -PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY) +FOLIO_FLAG(private_2, FOLIO_HEAD_PAGE) /* owner_2 can be set on tail pages for anon memory */ FOLIO_FLAG(owner_2, FOLIO_HEAD_PAGE) -- cgit v1.2.3 From 0e45a09a1da0872786885c505467aab8fb29b5b4 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 4 Sep 2024 21:17:06 -0700 Subject: Input: serio - define serio_pause_rx guard to pause and resume serio ports serio_pause_rx() and serio_continue_rx() are usually used together to temporarily stop receiving interrupts/data for a given serio port. Define "serio_pause_rx" guard for this so that the port is always resumed once critical section is over. Example: scoped_guard(serio_pause_rx, elo->serio) { elo->expected_packet = toupper(packet[0]); init_completion(&elo->cmd_done); } Link: https://lore.kernel.org/r/20240905041732.2034348-2-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- include/linux/serio.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serio.h b/include/linux/serio.h index bf2191f25350..69a47674af65 100644 --- a/include/linux/serio.h +++ b/include/linux/serio.h @@ -6,6 +6,7 @@ #define _SERIO_H +#include #include #include #include @@ -161,4 +162,6 @@ static inline void serio_continue_rx(struct serio *serio) spin_unlock_irq(&serio->lock); } +DEFINE_GUARD(serio_pause_rx, struct serio *, serio_pause_rx(_T), serio_continue_rx(_T)) + #endif -- cgit v1.2.3 From c653ffc283404a6c1c0e65143a833180c7ff799b Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Thu, 3 Oct 2024 07:46:51 -0700 Subject: HID: stop exporting hid_snto32() The only user of hid_snto32() is Logitech HID++ driver, which always calls hid_snto32() with valid size (constant, either 12 or 8) and therefore can simply use sign_extend32(). Make the switch and remove hid_snto32(). Move snto32() and s32ton() to avoid introducing forward declaration. Signed-off-by: Dmitry Torokhov Link: https://patch.msgid.link/20241003144656.3786064-2-dmitry.torokhov@gmail.com [bentiss: fix checkpatch warning] Signed-off-by: Benjamin Tissoires --- include/linux/hid.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 121d5b8bc867..f7c3a66647fd 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -974,7 +974,6 @@ const struct hid_device_id *hid_match_device(struct hid_device *hdev, struct hid_driver *hdrv); bool hid_compare_device_paths(struct hid_device *hdev_a, struct hid_device *hdev_b, char separator); -s32 hid_snto32(__u32 value, unsigned n); __u32 hid_field_extract(const struct hid_device *hid, __u8 *report, unsigned offset, unsigned n); -- cgit v1.2.3 From 9ab515b18f8463fbb340fece47cd461809e42a9d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:41 +0100 Subject: mm: Define VM_HIGH_ARCH_6 The addition of protection keys means that on arm64 we now use all of the currently defined VM_HIGH_ARCH_x bits. In order to allow us to allocate a new flag for GCS pages define VM_HIGH_ARCH_6. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-2-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ecf63d2b0582..182bad0c55df 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -329,12 +329,14 @@ extern unsigned int kobjsize(const void *objp); #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) #define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) +#define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6) #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ #ifdef CONFIG_ARCH_HAS_PKEYS -- cgit v1.2.3 From 91e102e79740ae43ded050ccac71aa3371db4f33 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:43 +0100 Subject: prctl: arch-agnostic prctl for shadow stack Three architectures (x86, aarch64, riscv) have announced support for shadow stacks with fairly similar functionality. While x86 is using arch_prctl() to control the functionality neither arm64 nor riscv uses that interface so this patch adds arch-agnostic prctl() support to get and set status of shadow stacks and lock the current configuation to prevent further changes, with support for turning on and off individual subfeatures so applications can limit their exposure to features that they do not need. The features are: - PR_SHADOW_STACK_ENABLE: Tracking and enforcement of shadow stacks, including allocation of a shadow stack if one is not already allocated. - PR_SHADOW_STACK_WRITE: Writes to specific addresses in the shadow stack. - PR_SHADOW_STACK_PUSH: Push additional values onto the shadow stack. These features are expected to be inherited by new threads and cleared on exec(), unknown features should be rejected for enable but accepted for locking (in order to allow for future proofing). This is based on a patch originally written by Deepak Gupta but modified fairly heavily, support for indirect landing pads is removed, additional modes added and the locking interface reworked. The set status prctl() is also reworked to just set flags, if setting/reading the shadow stack pointer is required this could be a separate prctl. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Acked-by: Yury Khrustalev Signed-off-by: Mark Brown Reviewed-by: Deepak Gupta Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-4-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- include/linux/mm.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 182bad0c55df..56654306a832 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4221,4 +4221,8 @@ static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) } #endif /* CONFIG_MEM_ALLOC_PROFILING */ +int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status); +int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); +int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); + #endif /* _LINUX_MM_H */ -- cgit v1.2.3 From ae80e1629aeaf5be726a9ea94eb7345b1a44b00d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:53 +0100 Subject: mm: Define VM_SHADOW_STACK for arm64 when we support GCS Use VM_HIGH_ARCH_5 for guarded control stack pages. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-14-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- include/linux/mm.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 56654306a832..8852c39c7695 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -367,7 +367,17 @@ extern unsigned int kobjsize(const void *objp); * for more details on the guard size. */ # define VM_SHADOW_STACK VM_HIGH_ARCH_5 -#else +#endif + +#if defined(CONFIG_ARM64_GCS) +/* + * arm64's Guarded Control Stack implements similar functionality and + * has similar constraints to shadow stacks. + */ +# define VM_SHADOW_STACK VM_HIGH_ARCH_6 +#endif + +#ifndef VM_SHADOW_STACK # define VM_SHADOW_STACK VM_NONE #endif -- cgit v1.2.3 From b63c755cb65d43c8aba987c4f6b57c77c6f123f2 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 30 Sep 2024 14:29:53 +0100 Subject: appletalk: Remove deadcode alloc_ltalkdev in net/appletalk/dev.c is dead since commit 00f3696f7555 ("net: appletalk: remove cops support") Removing it (and it's helper) leaves dev.c and if_ltalk.h empty; remove them and the Makefile entry. tun.c was including that if_ltalk.h but actually wanted the uapi version for LTALK_ALEN, fix up the path. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Willem de Bruijn Acked-by: Jason Wang Signed-off-by: David S. Miller --- include/linux/if_ltalk.h | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 include/linux/if_ltalk.h (limited to 'include/linux') diff --git a/include/linux/if_ltalk.h b/include/linux/if_ltalk.h deleted file mode 100644 index 4cc1c0b77870..000000000000 --- a/include/linux/if_ltalk.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __LINUX_LTALK_H -#define __LINUX_LTALK_H - -#include - -extern struct net_device *alloc_ltalkdev(int sizeof_priv); -#endif -- cgit v1.2.3 From ec841b8d73cff37f8960e209017efe1eb2fb21f2 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Mon, 23 Sep 2024 16:12:01 +0800 Subject: usb: chipidea: add CI_HDRC_HAS_SHORT_PKT_LIMIT flag Currently, the imx deivice controller has below limitations: 1. can't generate short packet interrupt if IOC not set in dTD. So if one request span more than one dTDs and only the last dTD set IOC, the usb request will pending there if no more data comes. 2. the controller can't accurately deliver data to differtent usb requests in some cases due to short packet. For example: one usb request span 3 dTDs, then if the controller received a short packet the next packet will go to 2nd dTD of current request rather than the first dTD of next request. 3. can't build a bus packet use multiple dTDs. For example: controller needs to send one packet of 512 bytes use dTD1 (200 bytes) + dTD2 (312 bytes), actually the host side will see 200 bytes short packet. Based on these limits, add CI_HDRC_HAS_SHORT_PKT_LIMIT flag and use it on imx platforms. Signed-off-by: Xu Yang Acked-by: Peter Chen Link: https://lore.kernel.org/r/20240923081203.2851768-1-xu.yang_2@nxp.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/chipidea.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/usb/chipidea.h b/include/linux/usb/chipidea.h index 5a7f96684ea2..ebdfef124b2b 100644 --- a/include/linux/usb/chipidea.h +++ b/include/linux/usb/chipidea.h @@ -65,6 +65,7 @@ struct ci_hdrc_platform_data { #define CI_HDRC_PHY_VBUS_CONTROL BIT(16) #define CI_HDRC_HAS_PORTSC_PEC_MISSED BIT(17) #define CI_HDRC_FORCE_VBUS_ACTIVE_ALWAYS BIT(18) +#define CI_HDRC_HAS_SHORT_PKT_LIMIT BIT(19) enum usb_dr_mode dr_mode; #define CI_HDRC_CONTROLLER_RESET_EVENT 0 #define CI_HDRC_CONTROLLER_STOPPED_EVENT 1 -- cgit v1.2.3 From 8b7fd6a15f8c32760c2026a62dcf55219b4da15b Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 1 Oct 2024 16:30:05 +0200 Subject: HID: bpf: move HID-BPF report descriptor fixup earlier Currently, hid_bpf_rdesc_fixup() is called once the match between the HID device and the driver is done. This can be problematic in case the driver selected by the kernel would change the report descriptor after the fact. To give a chance for hid_bpf_rdesc_fixup() to provide hints on to how to select a dedicated driver or not, move the call to that BPF hook earlier in the .probe() process, when we get the first match. However, this means that we might get called more than once (typically once for hid-generic, and once for hid-vendor-specific). So we store the result of HID-BPF fixup in struct hid_device. Basically, this means that ->bpf_rdesc can replace ->dev_rdesc when it was used in the code. In order to not grow struct hid_device, some fields are re-ordered. This was the output of pahole for the first 128 bytes: struct hid_device { __u8 * dev_rdesc; /* 0 8 */ unsigned int dev_rsize; /* 8 4 */ /* XXX 4 bytes hole, try to pack */ __u8 * rdesc; /* 16 8 */ unsigned int rsize; /* 24 4 */ /* XXX 4 bytes hole, try to pack */ struct hid_collection * collection; /* 32 8 */ unsigned int collection_size; /* 40 4 */ unsigned int maxcollection; /* 44 4 */ unsigned int maxapplication; /* 48 4 */ __u16 bus; /* 52 2 */ __u16 group; /* 54 2 */ __u32 vendor; /* 56 4 */ __u32 product; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ __u32 version; /* 64 4 */ enum hid_type type; /* 68 4 */ unsigned int country; /* 72 4 */ /* XXX 4 bytes hole, try to pack */ struct hid_report_enum report_enum[3]; /* 80 6216 */ Basically, we got three holes of 4 bytes. We can reorder things a little and makes those 3 holes a continuous 12 bytes hole, which can be replaced by the new pointer and the new unsigned int we need. In terms of code allocation, when not using HID-BPF, we are back to kernel v6.2 in hid_open_report(). These multiple kmemdup() calls will be fixed in a later commit. Link: https://patch.msgid.link/20241001-hid-bpf-hid-generic-v3-1-2ef1019468df@kernel.org Signed-off-by: Benjamin Tissoires --- include/linux/hid.h | 18 ++++++++++-------- include/linux/hid_bpf.h | 11 +++-------- 2 files changed, 13 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 121d5b8bc867..ff58b5ceb62e 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -599,15 +599,17 @@ enum hid_battery_status { struct hid_driver; struct hid_ll_driver; -struct hid_device { /* device report descriptor */ - const __u8 *dev_rdesc; - unsigned dev_rsize; - const __u8 *rdesc; - unsigned rsize; +struct hid_device { + const __u8 *dev_rdesc; /* device report descriptor */ + const __u8 *bpf_rdesc; /* bpf modified report descriptor, if any */ + const __u8 *rdesc; /* currently used report descriptor */ + unsigned int dev_rsize; + unsigned int bpf_rsize; + unsigned int rsize; + unsigned int collection_size; /* Number of allocated hid_collections */ struct hid_collection *collection; /* List of HID collections */ - unsigned collection_size; /* Number of allocated hid_collections */ - unsigned maxcollection; /* Number of parsed collections */ - unsigned maxapplication; /* Number of applications */ + unsigned int maxcollection; /* Number of parsed collections */ + unsigned int maxapplication; /* Number of applications */ __u16 bus; /* BUS ID */ __u16 group; /* Report group */ __u32 vendor; /* Vendor ID */ diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index 6a47223e6460..a6876ab29004 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -212,7 +212,7 @@ int hid_bpf_connect_device(struct hid_device *hdev); void hid_bpf_disconnect_device(struct hid_device *hdev); void hid_bpf_destroy_device(struct hid_device *hid); int hid_bpf_device_init(struct hid_device *hid); -u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, const u8 *rdesc, unsigned int *size); +const u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, const u8 *rdesc, unsigned int *size); #else /* CONFIG_HID_BPF */ static inline u8 *dispatch_hid_bpf_device_event(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 *size, int interrupt, @@ -228,13 +228,8 @@ static inline int hid_bpf_connect_device(struct hid_device *hdev) { return 0; } static inline void hid_bpf_disconnect_device(struct hid_device *hdev) {} static inline void hid_bpf_destroy_device(struct hid_device *hid) {} static inline int hid_bpf_device_init(struct hid_device *hid) { return 0; } -/* - * This specialized allocator has to be a macro for its allocations to be - * accounted separately (to have a separate alloc_tag). The typecast is - * intentional to enforce typesafety. - */ -#define call_hid_bpf_rdesc_fixup(_hdev, _rdesc, _size) \ - ((u8 *)kmemdup(_rdesc, *(_size), GFP_KERNEL)) +static inline const u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, const u8 *rdesc, + unsigned int *size) { return rdesc; } #endif /* CONFIG_HID_BPF */ -- cgit v1.2.3 From 645c224ac5f6e0013931c342ea707b398d24d410 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 1 Oct 2024 16:30:12 +0200 Subject: HID: add per device quirk to force bind to hid-generic We already have the possibility to force not binding to hid-generic and rely on a dedicated driver, but we couldn't do the other way around. This is useful for BPF programs where we are fixing the report descriptor and the events, but want to avoid a specialized driver to come after BPF which would unwind everything that is done there. Reviewed-by: Peter Hutterer Link: https://patch.msgid.link/20241001-hid-bpf-hid-generic-v3-8-2ef1019468df@kernel.org Signed-off-by: Benjamin Tissoires --- include/linux/hid.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index ff58b5ceb62e..63330d623335 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -359,6 +359,7 @@ struct hid_item { * | @HID_QUIRK_NO_OUTPUT_REPORTS_ON_INTR_EP: * | @HID_QUIRK_HAVE_SPECIAL_DRIVER: * | @HID_QUIRK_INCREMENT_USAGE_ON_DUPLICATE: + * | @HID_QUIRK_IGNORE_SPECIAL_DRIVER * | @HID_QUIRK_FULLSPEED_INTERVAL: * | @HID_QUIRK_NO_INIT_REPORTS: * | @HID_QUIRK_NO_IGNORE: @@ -384,6 +385,7 @@ struct hid_item { #define HID_QUIRK_HAVE_SPECIAL_DRIVER BIT(19) #define HID_QUIRK_INCREMENT_USAGE_ON_DUPLICATE BIT(20) #define HID_QUIRK_NOINVERT BIT(21) +#define HID_QUIRK_IGNORE_SPECIAL_DRIVER BIT(22) #define HID_QUIRK_FULLSPEED_INTERVAL BIT(28) #define HID_QUIRK_NO_INIT_REPORTS BIT(29) #define HID_QUIRK_NO_IGNORE BIT(30) -- cgit v1.2.3 From 5acd957a986c167d357e617a887630203be29ea4 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 1 Oct 2024 13:37:04 +0300 Subject: net/mlx5: hw counters: Make fc_stats & fc_pool private The mlx5_fc_stats and mlx5_fc_pool structs are only used from fs_counters.c. As such, make them private there. mlx5_fc_pool is not used or referenced at all outside fs_counters. mlx5_fc_stats is referenced from mlx5_core_dev, so instead of having it as a direct member (which requires exporting it from fs_counters), store a pointer to it, allocate it on init and clear it on destroy. One caveat is that a simple container_of to get from a 'work' struct to the outermost mlx5_core_dev struct directly no longer works, so an extra pointer had to be added to mlx5_fc_stats back to the parent dev. Signed-off-by: Cosmin Ratiu Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241001103709.58127-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/mlx5/driver.h | 33 +-------------------------------- 1 file changed, 1 insertion(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index e23c692a34c7..fc7e6153b73d 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -45,7 +45,6 @@ #include #include #include -#include #include #include #include @@ -474,36 +473,6 @@ struct mlx5_core_sriov { u16 max_ec_vfs; }; -struct mlx5_fc_pool { - struct mlx5_core_dev *dev; - struct mutex pool_lock; /* protects pool lists */ - struct list_head fully_used; - struct list_head partially_used; - struct list_head unused; - int available_fcs; - int used_fcs; - int threshold; -}; - -struct mlx5_fc_stats { - spinlock_t counters_idr_lock; /* protects counters_idr */ - struct idr counters_idr; - struct list_head counters; - struct llist_head addlist; - struct llist_head dellist; - - struct workqueue_struct *wq; - struct delayed_work work; - unsigned long next_query; - unsigned long sampling_interval; /* jiffies */ - u32 *bulk_query_out; - int bulk_query_len; - size_t num_counters; - bool bulk_query_alloc_failed; - unsigned long next_bulk_query_alloc; - struct mlx5_fc_pool fc_pool; -}; - struct mlx5_events; struct mlx5_mpfs; struct mlx5_eswitch; @@ -630,7 +599,7 @@ struct mlx5_priv { struct mlx5_devcom_comp_dev *hca_devcom_comp; struct mlx5_fw_reset *fw_reset; struct mlx5_core_roce roce; - struct mlx5_fc_stats fc_stats; + struct mlx5_fc_stats *fc_stats; struct mlx5_rl_table rl_table; struct mlx5_ft_pool *ft_pool; -- cgit v1.2.3 From d1c9cffe4b01f4d8bc52169139a5fedd48908abc Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 1 Oct 2024 13:37:09 +0300 Subject: net/mlx5: hw counters: Remove mlx5_fc_create_ex It no longer serves any purpose and is identical to mlx5_fc_create upon which it was originally based of. Signed-off-by: Cosmin Ratiu Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241001103709.58127-7-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/mlx5/fs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index b744e554f014..438db888bde0 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -298,9 +298,6 @@ int mlx5_modify_rule_destination(struct mlx5_flow_handle *handler, struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging); -/* As mlx5_fc_create() but doesn't queue stats refresh thread. */ -struct mlx5_fc *mlx5_fc_create_ex(struct mlx5_core_dev *dev, bool aging); - void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter); u64 mlx5_fc_query_lastuse(struct mlx5_fc *counter); void mlx5_fc_query_cached(struct mlx5_fc *counter, -- cgit v1.2.3 From f858cc9eed5b05cbe38d7ffd2787c21e3718eb7d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 3 Oct 2024 12:12:18 +0000 Subject: net: add IFLA_MAX_PACING_OFFLOAD_HORIZON device attribute Some network devices have the ability to offload EDT (Earliest Departure Time) which is the model used for TCP pacing and FQ packet scheduler. Some of them implement the timing wheel mechanism described in https://saeed.github.io/files/carousel-sigcomm17.pdf with an associated 'timing wheel horizon'. This patch adds dev->max_pacing_offload_horizon expressing this timing wheel horizon in nsec units. This is a read-only attribute. Unless a driver sets it, dev->max_pacing_offload_horizon is zero. v2: addressed Jakub feedback ( https://lore.kernel.org/netdev/20240930152304.472767-2-edumazet@google.com/T/#mf6294d714c41cc459962154cc2580ce3c9693663 ) v3: added yaml doc (also per Jakub feedback) Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241003121219.2396589-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4d20c776a4ff..49a7e7db0883 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2009,6 +2009,8 @@ enum netdev_reg_state { * @dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem, * where the clock is recovered. * + * @max_pacing_offload_horizon: max EDT offload horizon in nsec. + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -2399,6 +2401,8 @@ struct net_device { /** @irq_moder: dim parameters used if IS_ENABLED(CONFIG_DIMLIB). */ struct dim_irq_moder *irq_moder; + u64 max_pacing_offload_horizon; + u8 priv[] ____cacheline_aligned __counted_by(priv_len); } ____cacheline_aligned; -- cgit v1.2.3 From 1e562deacecca1f1bec7d23da526904a1e87525e Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 10 Sep 2024 16:30:16 +0200 Subject: crypto: rsassa-pkcs1 - Migrate to sig_alg backend A sig_alg backend has just been introduced with the intent of moving all asymmetric sign/verify algorithms to it one by one. Migrate the sign/verify operations from rsa-pkcs1pad.c to a separate rsassa-pkcs1.c which uses the new backend. Consequently there are now two templates which build on the "rsa" akcipher_alg: * The existing "pkcs1pad" template, which is instantiated as an akcipher_instance and retains the encrypt/decrypt operations of RSAES-PKCS1-v1_5 (RFC 8017 sec 7.2). * The new "pkcs1" template, which is instantiated as a sig_instance and contains the sign/verify operations of RSASSA-PKCS1-v1_5 (RFC 8017 sec 8.2). In a separate step, rsa-pkcs1pad.c could optionally be renamed to rsaes-pkcs1.c for clarity. Additional "oaep" and "pss" templates could be added for RSAES-OAEP and RSASSA-PSS. Note that it's currently allowed to allocate a "pkcs1pad(rsa)" transform without specifying a hash algorithm. That makes sense if the transform is only used for encrypt/decrypt and continues to be supported. But for sign/verify, such transforms previously did not insert the Full Hash Prefix into the padding. The resulting message encoding was incompliant with EMSA-PKCS1-v1_5 (RFC 8017 sec 9.2) and therefore nonsensical. From here on in, it is no longer allowed to allocate a transform without specifying a hash algorithm if the transform is used for sign/verify operations. This simplifies the code because the insertion of the Full Hash Prefix is no longer optional, so various "if (digest_info)" clauses can be removed. There has been a previous attempt to forbid transform allocation without specifying a hash algorithm, namely by commit c0d20d22e0ad ("crypto: rsa-pkcs1pad - Require hash to be present"). It had to be rolled back with commit b3a8c8a5ebb5 ("crypto: rsa-pkcs1pad: Allow hash to be optional [ver #2]"), presumably because it broke allocation of a transform which was solely used for encrypt/decrypt, not sign/verify. Avoid such breakage by allowing transform allocation for encrypt/decrypt with and without specifying a hash algorithm (and simply ignoring the hash algorithm in the former case). So again, specifying a hash algorithm is now mandatory for sign/verify, but optional and ignored for encrypt/decrypt. The new sig_alg API uses kernel buffers instead of sglists, which avoids the overhead of copying signature and digest from sglists back into kernel buffers. rsassa-pkcs1.c is thus simplified quite a bit. sig_alg is always synchronous, whereas the underlying "rsa" akcipher_alg may be asynchronous. So await the result of the akcipher_alg, similar to crypto_akcipher_sync_{en,de}crypt(). As part of the migration, rename "rsa_digest_info" to "hash_prefix" to adhere to the spec language in RFC 9580. Otherwise keep the code unmodified wherever possible to ease reviewing and bisecting. Leave several simplification and hardening opportunities to separate commits. rsassa-pkcs1.c uses modern __free() syntax for allocation of buffers which need to be freed by kfree_sensitive(), hence a DEFINE_FREE() clause for kfree_sensitive() is introduced herein as a byproduct. Signed-off-by: Lukas Wunner Signed-off-by: Herbert Xu --- include/linux/slab.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index b35e2db7eb0e..0268ea7abf8b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -448,6 +448,7 @@ void kfree_sensitive(const void *objp); size_t __ksize(const void *objp); DEFINE_FREE(kfree, void *, if (!IS_ERR_OR_NULL(_T)) kfree(_T)) +DEFINE_FREE(kfree_sensitive, void *, if (_T) kfree_sensitive(_T)) /** * ksize - Report actual allocation size of associated object -- cgit v1.2.3 From 4df86c6ea5c37fe0452638f39a1e4b189da75c54 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 10 Sep 2024 16:30:23 +0200 Subject: ASN.1: Clean up include statements in public headers If is the first header included from a .c file (due to headers being sorted alphabetically), the compiler complains: include/linux/asn1_decoder.h:18:29: error: unknown type name 'size_t' Avoid by including . Jonathan notes that the counterpart already includes , but additionally includes the unnecessary . Drop it. Signed-off-by: Lukas Wunner Reviewed-by: Stefan Berger Reviewed-by: Jonathan Cameron Signed-off-by: Herbert Xu --- include/linux/asn1_decoder.h | 1 + include/linux/asn1_encoder.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/asn1_decoder.h b/include/linux/asn1_decoder.h index 83f9c6e1e5e9..b41bce82a191 100644 --- a/include/linux/asn1_decoder.h +++ b/include/linux/asn1_decoder.h @@ -9,6 +9,7 @@ #define _LINUX_ASN1_DECODER_H #include +#include struct asn1_decoder; diff --git a/include/linux/asn1_encoder.h b/include/linux/asn1_encoder.h index 08cd0c2ad34f..d17484dffb74 100644 --- a/include/linux/asn1_encoder.h +++ b/include/linux/asn1_encoder.h @@ -6,7 +6,6 @@ #include #include #include -#include #define asn1_oid_len(oid) (sizeof(oid)/sizeof(u32)) unsigned char * -- cgit v1.2.3 From 452c55dcefa958f7e87798d764497485687e4bc3 Mon Sep 17 00:00:00 2001 From: Chenghai Huang Date: Sun, 29 Sep 2024 19:26:57 +0800 Subject: crypto: hisilicon/qm - fix the coding specifications issue Ensure that the inline function contains no more than 10 lines. move q_num_set() from hisi_acc_qm.h to qm.c. Signed-off-by: Chenghai Huang Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 33 ++------------------------------- 1 file changed, 2 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 9d7754ad5e9b..389e95754776 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -436,37 +436,6 @@ struct hisi_qp { struct uacce_queue *uacce_q; }; -static inline int q_num_set(const char *val, const struct kernel_param *kp, - unsigned int device) -{ - struct pci_dev *pdev; - u32 n, q_num; - int ret; - - if (!val) - return -EINVAL; - - pdev = pci_get_device(PCI_VENDOR_ID_HUAWEI, device, NULL); - if (!pdev) { - q_num = min_t(u32, QM_QNUM_V1, QM_QNUM_V2); - pr_info("No device found currently, suppose queue number is %u\n", - q_num); - } else { - if (pdev->revision == QM_HW_V1) - q_num = QM_QNUM_V1; - else - q_num = QM_QNUM_V2; - - pci_dev_put(pdev); - } - - ret = kstrtou32(val, 10, &n); - if (ret || n < QM_MIN_QNUM || n > q_num) - return -EINVAL; - - return param_set_int(val, kp); -} - static inline int vfs_num_set(const char *val, const struct kernel_param *kp) { u32 n; @@ -526,6 +495,8 @@ static inline void hisi_qm_del_list(struct hisi_qm *qm, struct hisi_qm_list *qm_ mutex_unlock(&qm_list->lock); } +int hisi_qm_q_num_set(const char *val, const struct kernel_param *kp, + unsigned int device); int hisi_qm_init(struct hisi_qm *qm); void hisi_qm_uninit(struct hisi_qm *qm); int hisi_qm_start(struct hisi_qm *qm); -- cgit v1.2.3 From 21e92806d39c68af2accd1fb238c2daecfcf9fbd Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Sat, 14 Sep 2024 20:29:12 -0700 Subject: function_graph: Support recording and printing the function return address When using function_graph tracer to analyze the flow of kernel function execution, it is often necessary to quickly locate the exact line of code where the call occurs. While this may be easy at times, it can be more time-consuming when some functions are inlined or the flow is too long. This feature aims to simplify the process by recording the return address of traced funcions and printing it when outputing trace logs. To enhance human readability, the prefix 'ret=' is used for the kernel return value, while '<-' serves as the prefix for the return address in trace logs to make it look more like the function tracer. A new trace option named 'funcgraph-retaddr' has been introduced, and the existing option 'sym-addr' can be used to control the format of the return address. See below logs with both funcgraph-retval and funcgraph-retaddr enabled. 0) | load_elf_binary() { /* <-bprm_execve+0x249/0x600 */ 0) | load_elf_phdrs() { /* <-load_elf_binary+0x84/0x1730 */ 0) | __kmalloc_noprof() { /* <-load_elf_phdrs+0x4a/0xb0 */ 0) 3.657 us | __cond_resched(); /* <-__kmalloc_noprof+0x28c/0x390 ret=0x0 */ 0) + 24.335 us | } /* __kmalloc_noprof ret=0xffff8882007f3000 */ 0) | kernel_read() { /* <-load_elf_phdrs+0x6c/0xb0 */ 0) | rw_verify_area() { /* <-kernel_read+0x2b/0x50 */ 0) | security_file_permission() { /* <-kernel_read+0x2b/0x50 */ 0) | selinux_file_permission() { /* <-security_file_permission+0x26/0x40 */ 0) | __inode_security_revalidate() { /* <-selinux_file_permission+0x6d/0x140 */ 0) 2.034 us | __cond_resched(); /* <-__inode_security_revalidate+0x5f/0x80 ret=0x0 */ 0) 6.602 us | } /* __inode_security_revalidate ret=0x0 */ 0) 2.214 us | avc_policy_seqno(); /* <-selinux_file_permission+0x107/0x140 ret=0x0 */ 0) + 16.670 us | } /* selinux_file_permission ret=0x0 */ 0) + 20.809 us | } /* security_file_permission ret=0x0 */ 0) + 25.217 us | } /* rw_verify_area ret=0x0 */ 0) | __kernel_read() { /* <-load_elf_phdrs+0x6c/0xb0 */ 0) | ext4_file_read_iter() { /* <-__kernel_read+0x160/0x2e0 */ Then, we can use the faddr2line to locate the source code, for example: $ ./scripts/faddr2line ./vmlinux load_elf_phdrs+0x6c/0xb0 load_elf_phdrs+0x6c/0xb0: elf_read at fs/binfmt_elf.c:471 (inlined by) load_elf_phdrs at fs/binfmt_elf.c:531 Link: https://lore.kernel.org/20240915032912.1118397-1-dolinux.peng@gmail.com Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202409150605.HgUmU8ea-lkp@intel.com/ Signed-off-by: Donglin Peng [ Rebased to handle text_delta offsets ] Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e684addf6508..2ac3b3b53cd0 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1040,6 +1040,17 @@ struct ftrace_graph_ent { int depth; } __packed; +/* + * Structure that defines an entry function trace with retaddr. + * It's already packed but the attribute "packed" is needed + * to remove extra padding at the end. + */ +struct fgraph_retaddr_ent { + unsigned long func; /* Current function */ + int depth; + unsigned long retaddr; /* Return address */ +} __packed; + /* * Structure that defines a return function trace. * It's already packed but the attribute "packed" is needed @@ -1057,19 +1068,29 @@ struct ftrace_graph_ret { unsigned long long rettime; } __packed; +struct fgraph_extras; struct fgraph_ops; /* Type of the callback handlers for tracing function graph*/ typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *, struct fgraph_ops *); /* return */ typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *, - struct fgraph_ops *); /* entry */ + struct fgraph_ops *, + struct fgraph_extras *); /* entry */ -extern int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace, struct fgraph_ops *gops); +extern int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace, + struct fgraph_ops *gops, + struct fgraph_extras *extras); bool ftrace_pids_enabled(struct ftrace_ops *ops); #ifdef CONFIG_FUNCTION_GRAPH_TRACER +/* Used to convey some extra datas when creating a graph entry */ +struct fgraph_extras { + u32 flags; + unsigned long retaddr; +}; + struct fgraph_ops { trace_func_graph_ent_t entryfunc; trace_func_graph_ret_t retfunc; @@ -1115,6 +1136,8 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, unsigned long ret, unsigned long *retp); unsigned long *fgraph_get_task_var(struct fgraph_ops *gops); +u32 graph_tracer_flags_get(u32 flags); + /* * Sometimes we don't want to trace a function with the function * graph tracer but we want them to keep traced by the usual function -- cgit v1.2.3 From 70c8fd00a9bd0509bbf7bccd9baea8bbd5ddc756 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 2 Oct 2024 17:27:16 -0400 Subject: timekeeping: Add interfaces for handling timestamps with a floor value Multigrain timestamps allow the kernel to use fine-grained timestamps when an inode's attributes is being actively observed via ->getattr(). With this support, it's possible for a file to get a fine-grained timestamp, and another modified after it to get a coarse-grained stamp that is earlier than the fine-grained time. If this happens then the files can appear to have been modified in reverse order, which breaks VFS ordering guarantees [1]. To prevent this, maintain a floor value for multigrain timestamps. Whenever a fine-grained timestamp is handed out, record it, and when later coarse-grained stamps are handed out, ensure they are not earlier than that value. If the coarse-grained timestamp is earlier than the fine-grained floor, return the floor value instead. Add a static singleton atomic64_t into timekeeper.c that is used to keep track of the latest fine-grained time ever handed out. This is tracked as a monotonic ktime_t value to ensure that it isn't affected by clock jumps. Because it is updated at different times than the rest of the timekeeper object, the floor value is managed independently of the timekeeper via a cmpxchg() operation, and sits on its own cacheline. Add two new public interfaces: - ktime_get_coarse_real_ts64_mg() fills a timespec64 with the later of the coarse-grained clock and the floor time - ktime_get_real_ts64_mg() gets the fine-grained clock value, and tries to swap it into the floor. A timespec64 is filled with the result. The floor value is global and updated via a single try_cmpxchg(). If that fails then the operation raced with a concurrent update. Any concurrent update must be later than the existing floor value, so any racing tasks can accept any resulting floor value without retrying. [1]: POSIX requires that files be stamped with realtime clock values, and makes no provision for dealing with backward clock jumps. If a backward realtime clock jump occurs, then files can appear to have been modified in reverse order. Signed-off-by: Jeff Layton Signed-off-by: Thomas Gleixner Tested-by: Randy Dunlap # documentation bits Acked-by: John Stultz Link: https://lore.kernel.org/all/20241002-mgtime-v10-1-d1c4717f5284@kernel.org --- include/linux/timekeeping.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index fc12a9ba2c88..7aa85246c183 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -45,6 +45,10 @@ extern void ktime_get_real_ts64(struct timespec64 *tv); extern void ktime_get_coarse_ts64(struct timespec64 *ts); extern void ktime_get_coarse_real_ts64(struct timespec64 *ts); +/* Multigrain timestamp interfaces */ +extern void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts); +extern void ktime_get_real_ts64_mg(struct timespec64 *ts); + void getboottime64(struct timespec64 *ts); /* -- cgit v1.2.3 From 96f9a366ec8abe027326d7aab84d64370019f0f1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 2 Oct 2024 17:27:17 -0400 Subject: timekeeping: Add percpu counter for tracking floor swap events The mgtime_floor value is a global variable for tracking the latest fine-grained timestamp handed out. Because it's a global, track the number of times that a new floor value is assigned. Add a new percpu counter to the timekeeping code to track the number of floor swap events that have occurred. A later patch will add a debugfs file to display this counter alongside other stats involving multigrain timestamps. Signed-off-by: Jeff Layton Signed-off-by: Thomas Gleixner Tested-by: Randy Dunlap # documentation bits Link: https://lore.kernel.org/all/20241002-mgtime-v10-2-d1c4717f5284@kernel.org --- include/linux/timekeeping.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 7aa85246c183..84a035e86ac8 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -48,6 +48,7 @@ extern void ktime_get_coarse_real_ts64(struct timespec64 *ts); /* Multigrain timestamp interfaces */ extern void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts); extern void ktime_get_real_ts64_mg(struct timespec64 *ts); +extern unsigned long timekeeping_get_mg_floor_swaps(void); void getboottime64(struct timespec64 *ts); -- cgit v1.2.3 From 2382d68d7d43873ba856baf567cab0d5c523f23b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 25 Sep 2024 15:31:38 +1000 Subject: sched: change wake_up_bit() and related function to expect unsigned long * wake_up_bit() currently allows a "void *". While this isn't strictly a problem as the address is never dereferenced, it is inconsistent with the corresponding wait_on_bit() which requires "unsigned long *" and does dereference the pointer. Any code that needs to wait for a change in something other than an unsigned long would be better served by wake_up_var()/wait_var_event(). This patch changes all related "void *" to "unsigned long *". Reported-by: Linus Torvalds Signed-off-by: NeilBrown Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240925053405.3960701-2-neilb@suse.de --- include/linux/wait_bit.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h index 7725b7579b78..48e123839892 100644 --- a/include/linux/wait_bit.h +++ b/include/linux/wait_bit.h @@ -8,7 +8,7 @@ #include struct wait_bit_key { - void *flags; + unsigned long *flags; int bit_nr; unsigned long timeout; }; @@ -23,14 +23,14 @@ struct wait_bit_queue_entry { typedef int wait_bit_action_f(struct wait_bit_key *key, int mode); -void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit); +void __wake_up_bit(struct wait_queue_head *wq_head, unsigned long *word, int bit); int __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode); int __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode); -void wake_up_bit(void *word, int bit); -int out_of_line_wait_on_bit(void *word, int, wait_bit_action_f *action, unsigned int mode); -int out_of_line_wait_on_bit_timeout(void *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout); -int out_of_line_wait_on_bit_lock(void *word, int, wait_bit_action_f *action, unsigned int mode); -struct wait_queue_head *bit_waitqueue(void *word, int bit); +void wake_up_bit(unsigned long *word, int bit); +int out_of_line_wait_on_bit(unsigned long *word, int, wait_bit_action_f *action, unsigned int mode); +int out_of_line_wait_on_bit_timeout(unsigned long *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout); +int out_of_line_wait_on_bit_lock(unsigned long *word, int, wait_bit_action_f *action, unsigned int mode); +struct wait_queue_head *bit_waitqueue(unsigned long *word, int bit); extern void __init wait_bit_init(void); int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); @@ -327,7 +327,7 @@ do { \ * You can use this helper if bitflags are manipulated atomically rather than * non-atomically under a lock. */ -static inline void clear_and_wake_up_bit(int bit, void *word) +static inline void clear_and_wake_up_bit(int bit, unsigned long *word) { clear_bit_unlock(bit, word); /* See wake_up_bit() for which memory barrier you need to use. */ -- cgit v1.2.3 From 3cdee6b359f134da22f7fd4606e0338413cfd79e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 25 Sep 2024 15:31:39 +1000 Subject: sched: Improve documentation for wake_up_bit/wait_on_bit family of functions This patch revises the documention for wake_up_bit(), clear_and_wake_up_bit(), and all the wait_on_bit() family of functions. The new documentation places less emphasis on the pool of waitqueues used (an implementation detail) and focuses instead on details of how the functions behave. The barriers included in the wait functions and clear_and_wake_up_bit() and those required for wake_up_bit() are spelled out more clearly. The error statuses returned are given explicitly. The fact that the wait_on_bit_lock() function sets the bit is made more obvious. Signed-off-by: NeilBrown Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240925053405.3960701-3-neilb@suse.de --- include/linux/wait_bit.h | 159 +++++++++++++++++++++++++---------------------- 1 file changed, 86 insertions(+), 73 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h index 48e123839892..723e7bf35747 100644 --- a/include/linux/wait_bit.h +++ b/include/linux/wait_bit.h @@ -53,19 +53,21 @@ extern int bit_wait_io_timeout(struct wait_bit_key *key, int mode); /** * wait_on_bit - wait for a bit to be cleared - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on + * @word: the address containing the bit being waited on + * @bit: the bit at that address being waited on * @mode: the task state to sleep in * - * There is a standard hashed waitqueue table for generic use. This - * is the part of the hashtable's accessor API that waits on a bit. - * For instance, if one were to have waiters on a bitflag, one would - * call wait_on_bit() in threads waiting for the bit to clear. - * One uses wait_on_bit() where one is waiting for the bit to clear, - * but has no intention of setting it. - * Returned value will be zero if the bit was cleared, or non-zero - * if the process received a signal and the mode permitted wakeup - * on that signal. + * Wait for the given bit in an unsigned long or bitmap (see DECLARE_BITMAP()) + * to be cleared. The clearing of the bit must be signalled with + * wake_up_bit(), often as clear_and_wake_up_bit(). + * + * The process will wait on a waitqueue selected by hash from a shared + * pool. It will only be woken on a wake_up for the target bit, even + * if other processes on the same queue are waiting for other bits. + * + * Returned value will be zero if the bit was cleared in which case the + * call has ACQUIRE semantics, or %-EINTR if the process received a + * signal and the mode permitted wake up on that signal. */ static inline int wait_on_bit(unsigned long *word, int bit, unsigned mode) @@ -80,17 +82,20 @@ wait_on_bit(unsigned long *word, int bit, unsigned mode) /** * wait_on_bit_io - wait for a bit to be cleared - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on + * @word: the address containing the bit being waited on + * @bit: the bit at that address being waited on * @mode: the task state to sleep in * - * Use the standard hashed waitqueue table to wait for a bit - * to be cleared. This is similar to wait_on_bit(), but calls - * io_schedule() instead of schedule() for the actual waiting. + * Wait for the given bit in an unsigned long or bitmap (see DECLARE_BITMAP()) + * to be cleared. The clearing of the bit must be signalled with + * wake_up_bit(), often as clear_and_wake_up_bit(). + * + * This is similar to wait_on_bit(), but calls io_schedule() instead of + * schedule() for the actual waiting. * - * Returned value will be zero if the bit was cleared, or non-zero - * if the process received a signal and the mode permitted wakeup - * on that signal. + * Returned value will be zero if the bit was cleared in which case the + * call has ACQUIRE semantics, or %-EINTR if the process received a + * signal and the mode permitted wake up on that signal. */ static inline int wait_on_bit_io(unsigned long *word, int bit, unsigned mode) @@ -104,19 +109,24 @@ wait_on_bit_io(unsigned long *word, int bit, unsigned mode) } /** - * wait_on_bit_timeout - wait for a bit to be cleared or a timeout elapses - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on + * wait_on_bit_timeout - wait for a bit to be cleared or a timeout to elapse + * @word: the address containing the bit being waited on + * @bit: the bit at that address being waited on * @mode: the task state to sleep in * @timeout: timeout, in jiffies * - * Use the standard hashed waitqueue table to wait for a bit - * to be cleared. This is similar to wait_on_bit(), except also takes a - * timeout parameter. + * Wait for the given bit in an unsigned long or bitmap (see + * DECLARE_BITMAP()) to be cleared, or for a timeout to expire. The + * clearing of the bit must be signalled with wake_up_bit(), often as + * clear_and_wake_up_bit(). * - * Returned value will be zero if the bit was cleared before the - * @timeout elapsed, or non-zero if the @timeout elapsed or process - * received a signal and the mode permitted wakeup on that signal. + * This is similar to wait_on_bit(), except it also takes a timeout + * parameter. + * + * Returned value will be zero if the bit was cleared in which case the + * call has ACQUIRE semantics, or %-EINTR if the process received a + * signal and the mode permitted wake up on that signal, or %-EAGAIN if the + * timeout elapsed. */ static inline int wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode, @@ -132,19 +142,21 @@ wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode, /** * wait_on_bit_action - wait for a bit to be cleared - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on + * @word: the address containing the bit waited on + * @bit: the bit at that address being waited on * @action: the function used to sleep, which may take special actions * @mode: the task state to sleep in * - * Use the standard hashed waitqueue table to wait for a bit - * to be cleared, and allow the waiting action to be specified. - * This is like wait_on_bit() but allows fine control of how the waiting - * is done. + * Wait for the given bit in an unsigned long or bitmap (see DECLARE_BITMAP()) + * to be cleared. The clearing of the bit must be signalled with + * wake_up_bit(), often as clear_and_wake_up_bit(). + * + * This is similar to wait_on_bit(), but calls @action() instead of + * schedule() for the actual waiting. * - * Returned value will be zero if the bit was cleared, or non-zero - * if the process received a signal and the mode permitted wakeup - * on that signal. + * Returned value will be zero if the bit was cleared in which case the + * call has ACQUIRE semantics, or the error code returned by @action if + * that call returned non-zero. */ static inline int wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action, @@ -157,23 +169,22 @@ wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action, } /** - * wait_on_bit_lock - wait for a bit to be cleared, when wanting to set it - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on + * wait_on_bit_lock - wait for a bit to be cleared, then set it + * @word: the address containing the bit being waited on + * @bit: the bit of the word being waited on and set * @mode: the task state to sleep in * - * There is a standard hashed waitqueue table for generic use. This - * is the part of the hashtable's accessor API that waits on a bit - * when one intends to set it, for instance, trying to lock bitflags. - * For instance, if one were to have waiters trying to set bitflag - * and waiting for it to clear before setting it, one would call - * wait_on_bit() in threads waiting to be able to set the bit. - * One uses wait_on_bit_lock() where one is waiting for the bit to - * clear with the intention of setting it, and when done, clearing it. + * Wait for the given bit in an unsigned long or bitmap (see + * DECLARE_BITMAP()) to be cleared. The clearing of the bit must be + * signalled with wake_up_bit(), often as clear_and_wake_up_bit(). As + * soon as it is clear, atomically set it and return. * - * Returns zero if the bit was (eventually) found to be clear and was - * set. Returns non-zero if a signal was delivered to the process and - * the @mode allows that signal to wake the process. + * This is similar to wait_on_bit(), but sets the bit before returning. + * + * Returned value will be zero if the bit was successfully set in which + * case the call has the same memory sequencing semantics as + * test_and_clear_bit(), or %-EINTR if the process received a signal and + * the mode permitted wake up on that signal. */ static inline int wait_on_bit_lock(unsigned long *word, int bit, unsigned mode) @@ -185,15 +196,18 @@ wait_on_bit_lock(unsigned long *word, int bit, unsigned mode) } /** - * wait_on_bit_lock_io - wait for a bit to be cleared, when wanting to set it - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on + * wait_on_bit_lock_io - wait for a bit to be cleared, then set it + * @word: the address containing the bit being waited on + * @bit: the bit of the word being waited on and set * @mode: the task state to sleep in * - * Use the standard hashed waitqueue table to wait for a bit - * to be cleared and then to atomically set it. This is similar - * to wait_on_bit(), but calls io_schedule() instead of schedule() - * for the actual waiting. + * Wait for the given bit in an unsigned long or bitmap (see + * DECLARE_BITMAP()) to be cleared. The clearing of the bit must be + * signalled with wake_up_bit(), often as clear_and_wake_up_bit(). As + * soon as it is clear, atomically set it and return. + * + * This is similar to wait_on_bit_lock(), but calls io_schedule() instead + * of schedule(). * * Returns zero if the bit was (eventually) found to be clear and was * set. Returns non-zero if a signal was delivered to the process and @@ -209,21 +223,19 @@ wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode) } /** - * wait_on_bit_lock_action - wait for a bit to be cleared, when wanting to set it - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on + * wait_on_bit_lock_action - wait for a bit to be cleared, then set it + * @word: the address containing the bit being waited on + * @bit: the bit of the word being waited on and set * @action: the function used to sleep, which may take special actions * @mode: the task state to sleep in * - * Use the standard hashed waitqueue table to wait for a bit - * to be cleared and then to set it, and allow the waiting action - * to be specified. - * This is like wait_on_bit() but allows fine control of how the waiting - * is done. + * This is similar to wait_on_bit_lock(), but calls @action() instead of + * schedule() for the actual waiting. * - * Returns zero if the bit was (eventually) found to be clear and was - * set. Returns non-zero if a signal was delivered to the process and - * the @mode allows that signal to wake the process. + * Returned value will be zero if the bit was successfully set in which + * case the call has the same memory sequencing semantics as + * test_and_clear_bit(), or the error code returned by @action if that + * call returned non-zero. */ static inline int wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action, @@ -320,12 +332,13 @@ do { \ /** * clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit - * * @bit: the bit of the word being waited on - * @word: the word being waited on, a kernel virtual address + * @word: the address containing the bit being waited on * - * You can use this helper if bitflags are manipulated atomically rather than - * non-atomically under a lock. + * The designated bit is cleared and any tasks waiting in wait_on_bit() + * or similar will be woken. This call has RELEASE semantics so that + * any changes to memory made before this call are guaranteed to be visible + * after the corresponding wait_on_bit() completes. */ static inline void clear_and_wake_up_bit(int bit, unsigned long *word) { -- cgit v1.2.3 From bf39882edc798279765ca31751f6e679b50b97ef Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 25 Sep 2024 15:31:40 +1000 Subject: sched: Document wait_var_event() family of functions and wake_up_var() wake_up_var(), wait_var_event() and related interfaces are not documented but have important ordering requirements. This patch adds documentation and makes these requirements explicit. The return values for those wait_var_event_* functions which return a value are documented. Note that these are, perhaps surprisingly, sometimes different from comparable wait_on_bit() functions. Signed-off-by: NeilBrown Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240925053405.3960701-4-neilb@suse.de --- include/linux/wait_bit.h | 71 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h index 723e7bf35747..06ec99b90bf3 100644 --- a/include/linux/wait_bit.h +++ b/include/linux/wait_bit.h @@ -282,6 +282,22 @@ __out: __ret; \ ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ schedule()) +/** + * wait_var_event - wait for a variable to be updated and notified + * @var: the address of variable being waited on + * @condition: the condition to wait for + * + * Wait for a @condition to be true, only re-checking when a wake up is + * received for the given @var (an arbitrary kernel address which need + * not be directly related to the given condition, but usually is). + * + * The process will wait on a waitqueue selected by hash from a shared + * pool. It will only be woken on a wake_up for the given address. + * + * The condition should normally use smp_load_acquire() or a similarly + * ordered access to ensure that any changes to memory made before the + * condition became true will be visible after the wait completes. + */ #define wait_var_event(var, condition) \ do { \ might_sleep(); \ @@ -294,6 +310,24 @@ do { \ ___wait_var_event(var, condition, TASK_KILLABLE, 0, 0, \ schedule()) +/** + * wait_var_event_killable - wait for a variable to be updated and notified + * @var: the address of variable being waited on + * @condition: the condition to wait for + * + * Wait for a @condition to be true or a fatal signal to be received, + * only re-checking the condition when a wake up is received for the given + * @var (an arbitrary kernel address which need not be directly related + * to the given condition, but usually is). + * + * This is similar to wait_var_event() but returns a value which is + * 0 if the condition became true, or %-ERESTARTSYS if a fatal signal + * was received. + * + * The condition should normally use smp_load_acquire() or a similarly + * ordered access to ensure that any changes to memory made before the + * condition became true will be visible after the wait completes. + */ #define wait_var_event_killable(var, condition) \ ({ \ int __ret = 0; \ @@ -308,6 +342,26 @@ do { \ TASK_UNINTERRUPTIBLE, 0, timeout, \ __ret = schedule_timeout(__ret)) +/** + * wait_var_event_timeout - wait for a variable to be updated or a timeout to expire + * @var: the address of variable being waited on + * @condition: the condition to wait for + * @timeout: maximum time to wait in jiffies + * + * Wait for a @condition to be true or a timeout to expire, only + * re-checking the condition when a wake up is received for the given + * @var (an arbitrary kernel address which need not be directly related + * to the given condition, but usually is). + * + * This is similar to wait_var_event() but returns a value which is 0 if + * the timeout expired and the condition was still false, or the + * remaining time left in the timeout (but at least 1) if the condition + * was found to be true. + * + * The condition should normally use smp_load_acquire() or a similarly + * ordered access to ensure that any changes to memory made before the + * condition became true will be visible after the wait completes. + */ #define wait_var_event_timeout(var, condition, timeout) \ ({ \ long __ret = timeout; \ @@ -321,6 +375,23 @@ do { \ ___wait_var_event(var, condition, TASK_INTERRUPTIBLE, 0, 0, \ schedule()) +/** + * wait_var_event_killable - wait for a variable to be updated and notified + * @var: the address of variable being waited on + * @condition: the condition to wait for + * + * Wait for a @condition to be true or a signal to be received, only + * re-checking the condition when a wake up is received for the given + * @var (an arbitrary kernel address which need not be directly related + * to the given condition, but usually is). + * + * This is similar to wait_var_event() but returns a value which is 0 if + * the condition became true, or %-ERESTARTSYS if a signal was received. + * + * The condition should normally use smp_load_acquire() or a similarly + * ordered access to ensure that any changes to memory made before the + * condition became true will be visible after the wait completes. + */ #define wait_var_event_interruptible(var, condition) \ ({ \ int __ret = 0; \ -- cgit v1.2.3 From 52d633def56c10fe3e82a2c5d88c3ecb3f4e4852 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 25 Sep 2024 15:31:41 +1000 Subject: sched: Add test_and_clear_wake_up_bit() and atomic_dec_and_wake_up() There are common patterns in the kernel of using test_and_clear_bit() before wake_up_bit(), and atomic_dec_and_test() before wake_up_var(). These combinations don't need extra barriers but sometimes include them unnecessarily. To help avoid the unnecessary barriers and to help discourage the general use of wake_up_bit/var (which is a fragile interface) introduce two combined functions which implement these patterns. Also add store_release_wake_up() which supports the task of simply setting a non-atomic variable and sending a wakeup. This pattern requires barriers which are often omitted. Signed-off-by: NeilBrown Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240925053405.3960701-5-neilb@suse.de --- include/linux/wait_bit.h | 60 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h index 06ec99b90bf3..0272629b590a 100644 --- a/include/linux/wait_bit.h +++ b/include/linux/wait_bit.h @@ -419,4 +419,64 @@ static inline void clear_and_wake_up_bit(int bit, unsigned long *word) wake_up_bit(word, bit); } +/** + * test_and_clear_wake_up_bit - clear a bit if it was set: wake up anyone waiting on that bit + * @bit: the bit of the word being waited on + * @word: the address of memory containing that bit + * + * If the bit is set and can be atomically cleared, any tasks waiting in + * wait_on_bit() or similar will be woken. This call has the same + * complete ordering semantics as test_and_clear_bit(). Any changes to + * memory made before this call are guaranteed to be visible after the + * corresponding wait_on_bit() completes. + * + * Returns %true if the bit was successfully set and the wake up was sent. + */ +static inline bool test_and_clear_wake_up_bit(int bit, unsigned long *word) +{ + if (!test_and_clear_bit(bit, word)) + return false; + /* no extra barrier required */ + wake_up_bit(word, bit); + return true; +} + +/** + * atomic_dec_and_wake_up - decrement an atomic_t and if zero, wake up waiters + * @var: the variable to dec and test + * + * Decrements the atomic variable and if it reaches zero, send a wake_up to any + * processes waiting on the variable. + * + * This function has the same complete ordering semantics as atomic_dec_and_test. + * + * Returns %true is the variable reaches zero and the wake up was sent. + */ + +static inline bool atomic_dec_and_wake_up(atomic_t *var) +{ + if (!atomic_dec_and_test(var)) + return false; + /* No extra barrier required */ + wake_up_var(var); + return true; +} + +/** + * store_release_wake_up - update a variable and send a wake_up + * @var: the address of the variable to be updated and woken + * @val: the value to store in the variable. + * + * Store the given value in the variable send a wake up to any tasks + * waiting on the variable. All necessary barriers are included to ensure + * the task calling wait_var_event() sees the new value and all values + * written to memory before this call. + */ +#define store_release_wake_up(var, val) \ +do { \ + smp_store_release(var, val); \ + smp_mb(); \ + wake_up_var(var); \ +} while (0) + #endif /* _LINUX_WAIT_BIT_H */ -- cgit v1.2.3 From cc2e1c82d7e474753681a38b07b63034e107e369 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 25 Sep 2024 15:31:42 +1000 Subject: sched: Add wait/wake interface for variable updated under a lock. Sometimes we need to wait for a condition to be true which must be testing while holding a lock. Correspondingly the condition is made true while holding the lock and the wake up is sent under the lock. This patch provides wake and wait interfaces which can be used for this situation when the lock is a mutex or a spinlock, or any other lock for which there are foo_lock() and foo_unlock() functions. Signed-off-by: NeilBrown Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240925053405.3960701-6-neilb@suse.de --- include/linux/wait_bit.h | 106 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h index 0272629b590a..6aea10efca3d 100644 --- a/include/linux/wait_bit.h +++ b/include/linux/wait_bit.h @@ -401,6 +401,112 @@ do { \ __ret; \ }) +/** + * wait_var_event_any_lock - wait for a variable to be updated under a lock + * @var: the address of the variable being waited on + * @condition: condition to wait for + * @lock: the object that is locked to protect updates to the variable + * @type: prefix on lock and unlock operations + * @state: waiting state, %TASK_UNINTERRUPTIBLE etc. + * + * Wait for a condition which can only be reliably tested while holding + * a lock. The variables assessed in the condition will normal be updated + * under the same lock, and the wake up should be signalled with + * wake_up_var_locked() under the same lock. + * + * This is similar to wait_var_event(), but assumes a lock is held + * while calling this function and while updating the variable. + * + * This must be called while the given lock is held and the lock will be + * dropped when schedule() is called to wait for a wake up, and will be + * reclaimed before testing the condition again. The functions used to + * unlock and lock the object are constructed by appending _unlock and _lock + * to @type. + * + * Return %-ERESTARTSYS if a signal arrives which is allowed to interrupt + * the wait according to @state. + */ +#define wait_var_event_any_lock(var, condition, lock, type, state) \ +({ \ + int __ret = 0; \ + if (!(condition)) \ + __ret = ___wait_var_event(var, condition, state, 0, 0, \ + type ## _unlock(lock); \ + schedule(); \ + type ## _lock(lock)); \ + __ret; \ +}) + +/** + * wait_var_event_spinlock - wait for a variable to be updated under a spinlock + * @var: the address of the variable being waited on + * @condition: condition to wait for + * @lock: the spinlock which protects updates to the variable + * + * Wait for a condition which can only be reliably tested while holding + * a spinlock. The variables assessed in the condition will normal be updated + * under the same spinlock, and the wake up should be signalled with + * wake_up_var_locked() under the same spinlock. + * + * This is similar to wait_var_event(), but assumes a spinlock is held + * while calling this function and while updating the variable. + * + * This must be called while the given lock is held and the lock will be + * dropped when schedule() is called to wait for a wake up, and will be + * reclaimed before testing the condition again. + */ +#define wait_var_event_spinlock(var, condition, lock) \ + wait_var_event_any_lock(var, condition, lock, spin, TASK_UNINTERRUPTIBLE) + +/** + * wait_var_event_mutex - wait for a variable to be updated under a mutex + * @var: the address of the variable being waited on + * @condition: condition to wait for + * @mutex: the mutex which protects updates to the variable + * + * Wait for a condition which can only be reliably tested while holding + * a mutex. The variables assessed in the condition will normal be + * updated under the same mutex, and the wake up should be signalled + * with wake_up_var_locked() under the same mutex. + * + * This is similar to wait_var_event(), but assumes a mutex is held + * while calling this function and while updating the variable. + * + * This must be called while the given mutex is held and the mutex will be + * dropped when schedule() is called to wait for a wake up, and will be + * reclaimed before testing the condition again. + */ +#define wait_var_event_mutex(var, condition, lock) \ + wait_var_event_any_lock(var, condition, lock, mutex, TASK_UNINTERRUPTIBLE) + +/** + * wake_up_var_protected - wake up waiters for a variable asserting that it is safe + * @var: the address of the variable being waited on + * @cond: the condition which afirms this is safe + * + * When waking waiters which use wait_var_event_any_lock() the waker must be + * holding the reelvant lock to avoid races. This version of wake_up_var() + * asserts that the relevant lock is held and so no barrier is needed. + * The @cond is only tested when CONFIG_LOCKDEP is enabled. + */ +#define wake_up_var_protected(var, cond) \ +do { \ + lockdep_assert(cond); \ + wake_up_var(var); \ +} while (0) + +/** + * wake_up_var_locked - wake up waiters for a variable while holding a spinlock or mutex + * @var: the address of the variable being waited on + * @lock: The spinlock or mutex what protects the variable + * + * Send a wake up for the given variable which should be waited for with + * wait_var_event_spinlock() or wait_var_event_mutex(). Unlike wake_up_var(), + * no extra barriers are needed as the locking provides sufficient sequencing. + */ +#define wake_up_var_locked(var, lock) \ + wake_up_var_protected(var, lockdep_is_held(lock)) + /** * clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit * @bit: the bit of the word being waited on -- cgit v1.2.3 From 80681c04c5e8e4297b9ebf201ca3ce6242aa16c3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 25 Sep 2024 15:31:43 +1000 Subject: sched: add wait_var_event_io() It is not currently possible to wait wait_var_event for an io_schedule() style wait. This patch adds wait_var_event_io() for that purpose. Signed-off-by: NeilBrown Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240925053405.3960701-7-neilb@suse.de --- include/linux/wait_bit.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h index 6aea10efca3d..6346e26fbfd1 100644 --- a/include/linux/wait_bit.h +++ b/include/linux/wait_bit.h @@ -281,6 +281,9 @@ __out: __ret; \ #define __wait_var_event(var, condition) \ ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ schedule()) +#define __wait_var_event_io(var, condition) \ + ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ + io_schedule()) /** * wait_var_event - wait for a variable to be updated and notified @@ -306,6 +309,34 @@ do { \ __wait_var_event(var, condition); \ } while (0) +/** + * wait_var_event_io - wait for a variable to be updated and notified + * @var: the address of variable being waited on + * @condition: the condition to wait for + * + * Wait for an IO related @condition to be true, only re-checking when a + * wake up is received for the given @var (an arbitrary kernel address + * which need not be directly related to the given condition, but + * usually is). + * + * The process will wait on a waitqueue selected by hash from a shared + * pool. It will only be woken on a wake_up for the given address. + * + * This is similar to wait_var_event(), but calls io_schedule() instead + * of schedule(). + * + * The condition should normally use smp_load_acquire() or a similarly + * ordered access to ensure that any changes to memory made before the + * condition became true will be visible after the wait completes. + */ +#define wait_var_event_io(var, condition) \ +do { \ + might_sleep(); \ + if (condition) \ + break; \ + __wait_var_event_io(var, condition); \ +} while (0) + #define __wait_var_event_killable(var, condition) \ ___wait_var_event(var, condition, TASK_KILLABLE, 0, 0, \ schedule()) -- cgit v1.2.3 From 5e9f0c4819deb9459f32f12c4fd2b47993b8c395 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Mon, 30 Sep 2024 05:09:46 +0000 Subject: sched: remove unused __HAVE_THREAD_FUNCTIONS hook support __HAVE_THREAD_FUNCTIONS could be defined by architectures wishing to provide their own task_thread_info(), task_stack_page(), setup_thread_stack() and end_of_stack() hooks. Commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture") removed the last upstream consumer of __HAVE_THREAD_FUNCTIONS, so change the remaining !CONFIG_THREAD_INFO_IN_TASK && !__HAVE_THREAD_FUNCTIONS conditionals to only check for the former case. Signed-off-by: David Disseldorp Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ard Biesheuvel Link: https://lkml.kernel.org/r/20240930050945.30304-2-ddiss@suse.de --- include/linux/sched.h | 2 +- include/linux/sched/task_stack.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index e6ee4258169a..abf26f1e1447 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1898,7 +1898,7 @@ extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)]; #ifdef CONFIG_THREAD_INFO_IN_TASK # define task_thread_info(task) (&(task)->thread_info) -#elif !defined(__HAVE_THREAD_FUNCTIONS) +#else # define task_thread_info(task) ((struct thread_info *)(task)->stack) #endif diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h index bf10bdb487dd..2e52cc421bce 100644 --- a/include/linux/sched/task_stack.h +++ b/include/linux/sched/task_stack.h @@ -33,7 +33,7 @@ static __always_inline unsigned long *end_of_stack(const struct task_struct *tas #endif } -#elif !defined(__HAVE_THREAD_FUNCTIONS) +#else #define task_stack_page(task) ((void *)(task)->stack) -- cgit v1.2.3 From 0ac8f14ef22a1592b44dc90272aab35e43b0106a Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 2 Oct 2024 00:40:16 +0100 Subject: sched/wait: Remove unused bit_wait_io_timeout bit_wait_io_timeout has been unused since 2016's commit 62906027091f ("mm: add PageWaiters indicating tasks are waiting for a page bit") Remove it. Signed-off-by: "Dr. David Alan Gilbert" Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Tim Chen Link: https://lore.kernel.org/r/20241001234016.231696-1-linux@treblig.org --- include/linux/wait_bit.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h index 6346e26fbfd1..9e29d79fc790 100644 --- a/include/linux/wait_bit.h +++ b/include/linux/wait_bit.h @@ -49,7 +49,6 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync extern int bit_wait(struct wait_bit_key *key, int mode); extern int bit_wait_io(struct wait_bit_key *key, int mode); extern int bit_wait_timeout(struct wait_bit_key *key, int mode); -extern int bit_wait_io_timeout(struct wait_bit_key *key, int mode); /** * wait_on_bit - wait for a bit to be cleared -- cgit v1.2.3 From 066c779b094b63754e0742ad8675d72d6c0a46f6 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 3 Oct 2024 18:48:19 +0300 Subject: platform/x86: intel_scu_ipc: Don't use "proxy" headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update header inclusions to follow IWYU (Include What You Use) principle. Signed-off-by: Andy Shevchenko Acked-by: Mika Westerberg Link: https://lore.kernel.org/r/20241003154819.1075141-1-andriy.shevchenko@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/x86/intel_scu_ipc.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/intel_scu_ipc.h b/include/linux/platform_data/x86/intel_scu_ipc.h index 0ca9962e97f2..b287627759f7 100644 --- a/include/linux/platform_data/x86/intel_scu_ipc.h +++ b/include/linux/platform_data/x86/intel_scu_ipc.h @@ -2,9 +2,13 @@ #ifndef __PLATFORM_X86_INTEL_SCU_IPC_H_ #define __PLATFORM_X86_INTEL_SCU_IPC_H_ +#include #include +#include struct device; +struct module; + struct intel_scu_ipc_dev; /** -- cgit v1.2.3 From 4e40eff0b5737c0de39e1ae5812509efbc0b986e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 2 Oct 2024 17:27:18 -0400 Subject: fs: add infrastructure for multigrain timestamps The VFS has always used coarse-grained timestamps when updating the ctime and mtime after a change. This has the benefit of allowing filesystems to optimize away a lot metadata updates, down to around 1 per jiffy, even when a file is under heavy writes. Unfortunately, this has always been an issue when we're exporting via NFSv3, which relies on timestamps to validate caches. A lot of changes can happen in a jiffy, so timestamps aren't sufficient to help the client decide when to invalidate the cache. Even with NFSv4, a lot of exported filesystems don't properly support a change attribute and are subject to the same problems with timestamp granularity. Other applications have similar issues with timestamps (e.g backup applications). If fine-grained timestamps were always used, that would improve the situation, but that becomes rather expensive, as the underlying filesystem would have to log a lot more metadata updates. What is needed is a way to only use fine-grained timestamps when they are being actively queried. Use the (unused) top bit in inode->i_ctime_nsec as a flag that indicates whether the current timestamps have been queried via stat() or the like. When it's set, allow the update to use a fine-grained timestamp iff it's necessary to make the ctime show a different value. If it has been queried, then first see whether the current coarse time is later than the existing ctime. If it is, accept that value. If it isn't, then get a fine-grained timestamp and attempt to stamp the inode ctime with that value. If that races with another concurrent stamp, then abandon the update and take the new value without retrying. Filesystems can opt into this by setting the FS_MGTIME fstype flag. Others should be unaffected (other than being subject to the same floor value as multigrain filesystems). Tested-by: Randy Dunlap # documentation bits Reviewed-by: Jan Kara Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20241002-mgtime-v10-3-d1c4717f5284@kernel.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 6ca11e241a24..eff688e75f2f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1613,6 +1613,17 @@ static inline struct timespec64 inode_set_mtime(struct inode *inode, return inode_set_mtime_to_ts(inode, ts); } +/* + * Multigrain timestamps + * + * Conditionally use fine-grained ctime and mtime timestamps when there + * are users actively observing them via getattr. The primary use-case + * for this is NFS clients that use the ctime to distinguish between + * different states of the file, and that are often fooled by multiple + * operations that occur in the same coarse-grained timer tick. + */ +#define I_CTIME_QUERIED ((u32)BIT(31)) + static inline time64_t inode_get_ctime_sec(const struct inode *inode) { return inode->i_ctime_sec; @@ -1620,7 +1631,7 @@ static inline time64_t inode_get_ctime_sec(const struct inode *inode) static inline long inode_get_ctime_nsec(const struct inode *inode) { - return inode->i_ctime_nsec; + return inode->i_ctime_nsec & ~I_CTIME_QUERIED; } static inline struct timespec64 inode_get_ctime(const struct inode *inode) @@ -1631,13 +1642,7 @@ static inline struct timespec64 inode_get_ctime(const struct inode *inode) return ts; } -static inline struct timespec64 inode_set_ctime_to_ts(struct inode *inode, - struct timespec64 ts) -{ - inode->i_ctime_sec = ts.tv_sec; - inode->i_ctime_nsec = ts.tv_nsec; - return ts; -} +struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts); /** * inode_set_ctime - set the ctime in the inode @@ -2500,6 +2505,7 @@ struct file_system_type { #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ #define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */ #define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */ +#define FS_MGTIME 64 /* FS uses multigrain timestamps */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; @@ -2523,6 +2529,17 @@ struct file_system_type { #define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME) +/** + * is_mgtime: is this inode using multigrain timestamps + * @inode: inode to test for multigrain timestamps + * + * Return true if the inode uses multigrain timestamps, false otherwise. + */ +static inline bool is_mgtime(const struct inode *inode) +{ + return inode->i_sb->s_type->fs_flags & FS_MGTIME; +} + extern struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)); @@ -3262,6 +3279,7 @@ extern void page_put_link(void *); extern int page_symlink(struct inode *inode, const char *symname, int len); extern const struct inode_operations page_symlink_inode_operations; extern void kfree_link(void *); +void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode); void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *); void generic_fill_statx_attr(struct inode *inode, struct kstat *stat); void generic_fill_statx_atomic_writes(struct kstat *stat, -- cgit v1.2.3 From 8fd3395ec9051a52828fcca2328cb50a69dea8ef Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 31 Jul 2024 11:49:04 -0400 Subject: get rid of ...lookup...fdget_rcu() family Once upon a time, predecessors of those used to do file lookup without bumping a refcount, provided that caller held rcu_read_lock() across the lookup and whatever it wanted to read from the struct file found. When struct file allocation switched to SLAB_TYPESAFE_BY_RCU, that stopped being feasible and these primitives started to bump the file refcount for lookup result, requiring the caller to call fput() afterwards. But that turned them pointless - e.g. rcu_read_lock(); file = lookup_fdget_rcu(fd); rcu_read_unlock(); is equivalent to file = fget_raw(fd); and all callers of lookup_fdget_rcu() are of that form. Similarly, task_lookup_fdget_rcu() calls can be replaced with calling fget_task(). task_lookup_next_fdget_rcu() doesn't have direct counterparts, but its callers would be happier if we replaced it with an analogue that deals with RCU internally. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- include/linux/fdtable.h | 4 ---- include/linux/file.h | 1 + 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index b1c5722f2b3c..e25e2cb65d30 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -92,10 +92,6 @@ static inline struct file *files_lookup_fd_locked(struct files_struct *files, un return files_lookup_fd_raw(files, fd); } -struct file *lookup_fdget_rcu(unsigned int fd); -struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd); -struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *fd); - static inline bool close_on_exec(unsigned int fd, const struct files_struct *files) { return test_bit(fd, files_fdtable(files)->close_on_exec); diff --git a/include/linux/file.h b/include/linux/file.h index f98de143245a..ec4ad5e6a061 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -72,6 +72,7 @@ static inline void fdput(struct fd fd) extern struct file *fget(unsigned int fd); extern struct file *fget_raw(unsigned int fd); extern struct file *fget_task(struct task_struct *task, unsigned int fd); +extern struct file *fget_task_next(struct task_struct *task, unsigned int *fd); extern void __f_unlock_pos(struct file *); struct fd fdget(unsigned int fd); -- cgit v1.2.3 From cab0515211f483e392d6862021ed008f49058561 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 2 Jun 2024 17:48:36 -0400 Subject: move close_range(2) into fs/file.c, fold __close_range() into it We never had callers for __close_range() except for close_range(2) itself. Nothing of that sort has appeared in four years and if any users do show up, we can always separate those suckers again. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- include/linux/fdtable.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index e25e2cb65d30..c45306a9f007 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -111,7 +111,6 @@ int iterate_fd(struct files_struct *, unsigned, const void *); extern int close_fd(unsigned int fd); -extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); extern struct file *file_close_fd(unsigned int fd); extern struct kmem_cache *files_cachep; -- cgit v1.2.3 From a3f5f4c2f9b6bc2aa6f5a3e8e23b7519e4f2e3e3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Oct 2024 13:47:20 +0000 Subject: ipv4: remove fib_info_devhash[] Upcoming per-netns RTNL conversion needs to get rid of shared hash tables. fib_info_devhash[] is one of them. It is unclear why we used a hash table, because a single hlist_head per net device was cheaper and scalable. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: David Ahern Link: https://patch.msgid.link/20241004134720.579244-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 49a7e7db0883..3baf8e539b6f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2211,6 +2211,9 @@ struct net_device { /* Protocol-specific pointers */ struct in_device __rcu *ip_ptr; + /** @fib_nh_head: nexthops associated with this netdev */ + struct hlist_head fib_nh_head; + #if IS_ENABLED(CONFIG_VLAN_8021Q) struct vlan_info __rcu *vlan_info; #endif -- cgit v1.2.3 From 20a4da20e0bd0297bf1ce083362e78b6702f991e Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 4 Oct 2024 11:01:00 +0200 Subject: net: phy: Add support for PHY timing-role configuration via device tree Introduce support for configuring the master/slave role of PHYs based on the `timing-role` property in the device tree. While this functionality is necessary for Single Pair Ethernet (SPE) PHYs (1000/100/10Base-T1) where hardware strap pins may be unavailable or incorrectly set, it works for any PHY type. Signed-off-by: Oleksij Rempel Reviewed-by: Russell King (Oracle) Reviewed-by: Florian Fainelli Reviewed-by: Divya Koppera Signed-off-by: Paolo Abeni --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index a98bc91a0cde..ff762a3d8270 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1260,6 +1260,7 @@ size_t phy_speeds(unsigned int *speeds, size_t size, unsigned long *mask); void of_set_phy_supported(struct phy_device *phydev); void of_set_phy_eee_broken(struct phy_device *phydev); +void of_set_phy_timing_role(struct phy_device *phydev); int phy_speed_down_core(struct phy_device *phydev); /** -- cgit v1.2.3 From 95397784be23f66c5d4280b0a4c4e0d1ee74f2ef Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Mon, 2 Sep 2024 10:42:31 +0200 Subject: media: staging: drop omap4iss The omap4 camera driver has seen no progress since forever, and now OMAP4 support has also been dropped from u-boot (1). So it is time to retire this driver. (1): https://lists.denx.de/pipermail/u-boot/2024-July/558846.html Signed-off-by: Hans Verkuil Acked-by: Laurent Pinchart --- include/linux/platform_data/media/omap4iss.h | 66 ---------------------------- 1 file changed, 66 deletions(-) delete mode 100644 include/linux/platform_data/media/omap4iss.h (limited to 'include/linux') diff --git a/include/linux/platform_data/media/omap4iss.h b/include/linux/platform_data/media/omap4iss.h deleted file mode 100644 index 2a511a8fcda7..000000000000 --- a/include/linux/platform_data/media/omap4iss.h +++ /dev/null @@ -1,66 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ARCH_ARM_PLAT_OMAP4_ISS_H -#define ARCH_ARM_PLAT_OMAP4_ISS_H - -#include - -struct iss_device; - -enum iss_interface_type { - ISS_INTERFACE_CSI2A_PHY1, - ISS_INTERFACE_CSI2B_PHY2, -}; - -/** - * struct iss_csiphy_lane: CSI2 lane position and polarity - * @pos: position of the lane - * @pol: polarity of the lane - */ -struct iss_csiphy_lane { - u8 pos; - u8 pol; -}; - -#define ISS_CSIPHY1_NUM_DATA_LANES 4 -#define ISS_CSIPHY2_NUM_DATA_LANES 1 - -/** - * struct iss_csiphy_lanes_cfg - CSI2 lane configuration - * @data: Configuration of one or two data lanes - * @clk: Clock lane configuration - */ -struct iss_csiphy_lanes_cfg { - struct iss_csiphy_lane data[ISS_CSIPHY1_NUM_DATA_LANES]; - struct iss_csiphy_lane clk; -}; - -/** - * struct iss_csi2_platform_data - CSI2 interface platform data - * @crc: Enable the cyclic redundancy check - * @vpclk_div: Video port output clock control - */ -struct iss_csi2_platform_data { - unsigned crc:1; - unsigned vpclk_div:2; - struct iss_csiphy_lanes_cfg lanecfg; -}; - -struct iss_subdev_i2c_board_info { - struct i2c_board_info *board_info; - int i2c_adapter_id; -}; - -struct iss_v4l2_subdevs_group { - struct iss_subdev_i2c_board_info *subdevs; - enum iss_interface_type interface; - union { - struct iss_csi2_platform_data csi2; - } bus; /* gcc < 4.6.0 chokes on anonymous union initializers */ -}; - -struct iss_platform_data { - struct iss_v4l2_subdevs_group *subdevs; - void (*set_constraints)(struct iss_device *iss, bool enable); -}; - -#endif -- cgit v1.2.3 From ec763c234d7f60c5bce0fa2611ba79f5be1af76b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 4 Oct 2024 15:10:28 -0700 Subject: Revert "rtnetlink: add guard for RTNL" This reverts commit 464eb03c4a7cfb32cb3324249193cf6bb5b35152. Once we have a per-netns RTNL, we won't use guard(rtnl). Also, there's no users for now. $ grep -rnI "guard(rtnl" || true $ Suggested-by: Eric Dumazet Link: https://lore.kernel.org/netdev/CANn89i+KoYzUH+VPLdGmLABYf5y4TW0hrM4UAeQQJ9AREty0iw@mail.gmail.com/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/linux/rtnetlink.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index a7da7dfc06a2..cdfc897f1e3c 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -7,7 +7,6 @@ #include #include #include -#include #include extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo); @@ -47,8 +46,6 @@ extern int rtnl_is_locked(void); extern int rtnl_lock_killable(void); extern bool refcount_dec_and_rtnl_lock(refcount_t *r); -DEFINE_LOCK_GUARD_0(rtnl, rtnl_lock(), rtnl_unlock()) - extern wait_queue_head_t netdev_unregistering_wq; extern atomic_t dev_unreg_count; extern struct rw_semaphore pernet_ops_rwsem; -- cgit v1.2.3 From 76aed95319da25d6884dff01d5f0149e4b542f96 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 4 Oct 2024 15:10:29 -0700 Subject: rtnetlink: Add per-netns RTNL. The goal is to break RTNL down into per-netns mutex. This patch adds per-netns mutex and its helper functions, rtnl_net_lock() and rtnl_net_unlock(). rtnl_net_lock() acquires the global RTNL and per-netns RTNL mutex, and rtnl_net_unlock() releases them. We will replace 800+ rtnl_lock() with rtnl_net_lock() and finally removes rtnl_lock() in rtnl_net_lock(). When we need to nest per-netns RTNL mutex, we will use __rtnl_net_lock(), and its locking order is defined by rtnl_net_lock_cmp_fn() as follows: 1. init_net is first 2. netns address ascending order Note that the conversion will be done under CONFIG_DEBUG_NET_SMALL_RTNL with LOCKDEP so that we can carefully add the extra mutex without slowing down RTNL operations during conversion. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/linux/rtnetlink.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index cdfc897f1e3c..edd840a49989 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -92,6 +92,27 @@ static inline bool lockdep_rtnl_is_held(void) #define rcu_replace_pointer_rtnl(rp, p) \ rcu_replace_pointer(rp, p, lockdep_rtnl_is_held()) +#ifdef CONFIG_DEBUG_NET_SMALL_RTNL +void __rtnl_net_lock(struct net *net); +void __rtnl_net_unlock(struct net *net); +void rtnl_net_lock(struct net *net); +void rtnl_net_unlock(struct net *net); +int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b); +#else +static inline void __rtnl_net_lock(struct net *net) {} +static inline void __rtnl_net_unlock(struct net *net) {} + +static inline void rtnl_net_lock(struct net *net) +{ + rtnl_lock(); +} + +static inline void rtnl_net_unlock(struct net *net) +{ + rtnl_unlock(); +} +#endif + static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev) { return rtnl_dereference(dev->ingress_queue); -- cgit v1.2.3 From 844e5e7e656d3a7a904fd5607f8491d6fd01db8e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 4 Oct 2024 15:10:30 -0700 Subject: rtnetlink: Add assertion helpers for per-netns RTNL. Once an RTNL scope is converted with rtnl_net_lock(), we will replace RTNL helper functions inside the scope with the following per-netns alternatives: ASSERT_RTNL() -> ASSERT_RTNL_NET(net) rcu_dereference_rtnl(p) -> rcu_dereference_rtnl_net(net, p) Note that the per-netns helpers are equivalent to the conventional helpers unless CONFIG_DEBUG_NET_SMALL_RTNL is enabled. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/linux/rtnetlink.h | 45 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index edd840a49989..8468a4ce8510 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -51,6 +51,10 @@ extern atomic_t dev_unreg_count; extern struct rw_semaphore pernet_ops_rwsem; extern struct rw_semaphore net_rwsem; +#define ASSERT_RTNL() \ + WARN_ONCE(!rtnl_is_locked(), \ + "RTNL: assertion failed at %s (%d)\n", __FILE__, __LINE__) + #ifdef CONFIG_PROVE_LOCKING extern bool lockdep_rtnl_is_held(void); #else @@ -98,6 +102,22 @@ void __rtnl_net_unlock(struct net *net); void rtnl_net_lock(struct net *net); void rtnl_net_unlock(struct net *net); int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b); + +bool rtnl_net_is_locked(struct net *net); + +#define ASSERT_RTNL_NET(net) \ + WARN_ONCE(!rtnl_net_is_locked(net), \ + "RTNL_NET: assertion failed at %s (%d)\n", \ + __FILE__, __LINE__) + +bool lockdep_rtnl_net_is_held(struct net *net); + +#define rcu_dereference_rtnl_net(net, p) \ + rcu_dereference_check(p, lockdep_rtnl_net_is_held(net)) +#define rtnl_net_dereference(net, p) \ + rcu_dereference_protected(p, lockdep_rtnl_net_is_held(net)) +#define rcu_replace_pointer_rtnl_net(net, rp, p) \ + rcu_replace_pointer(rp, p, lockdep_rtnl_net_is_held(net)) #else static inline void __rtnl_net_lock(struct net *net) {} static inline void __rtnl_net_unlock(struct net *net) {} @@ -111,6 +131,27 @@ static inline void rtnl_net_unlock(struct net *net) { rtnl_unlock(); } + +static inline void ASSERT_RTNL_NET(struct net *net) +{ + ASSERT_RTNL(); +} + +static inline void *rcu_dereference_rtnl_net(struct net *net, void *p) +{ + return rcu_dereference_rtnl(p); +} + +static inline void *rtnl_net_dereference(struct net *net, void *p) +{ + return rtnl_dereference(p); +} + +static inline void *rcu_replace_pointer_rtnl_net(struct net *net, + void *rp, void *p) +{ + return rcu_replace_pointer_rtnl(rp, p); +} #endif static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev) @@ -140,10 +181,6 @@ void rtnetlink_init(void); void __rtnl_unlock(void); void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail); -#define ASSERT_RTNL() \ - WARN_ONCE(!rtnl_is_locked(), \ - "RTNL: assertion failed at %s (%d)\n", __FILE__, __LINE__) - extern int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, -- cgit v1.2.3 From 55b834873e800a5809787ce6aedcb70ee49a596f Mon Sep 17 00:00:00 2001 From: Dmitry Perchanov Date: Mon, 26 Aug 2024 16:05:04 +0300 Subject: media: uvcvideo: Add luma 16-bit interlaced pixel format The formats added by this patch are: UVC_GUID_FORMAT_Y16I Interlaced lumina format primary use in RealSense Depth cameras with stereo stream for left and right image sensors. Signed-off-by: Dmitry Perchanov Reviewed-by: Laurent Pinchart Link: https://lore.kernel.org/r/a717a912035b0a0f82b2f35719cca0c5269e995f.camel@intel.com Signed-off-by: Laurent Pinchart Signed-off-by: Hans Verkuil --- include/linux/usb/uvc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/uvc.h b/include/linux/usb/uvc.h index 88d96095bcb1..1c16be20c966 100644 --- a/include/linux/usb/uvc.h +++ b/include/linux/usb/uvc.h @@ -118,6 +118,9 @@ #define UVC_GUID_FORMAT_Y12I \ { 'Y', '1', '2', 'I', 0x00, 0x00, 0x10, 0x00, \ 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_Y16I \ + { 'Y', '1', '6', 'I', 0x00, 0x00, 0x10, 0x00, \ + 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} #define UVC_GUID_FORMAT_Z16 \ { 'Z', '1', '6', ' ', 0x00, 0x00, 0x10, 0x00, \ 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -- cgit v1.2.3 From a7e742e416bc331c0f409d6a3630471896e696d3 Mon Sep 17 00:00:00 2001 From: David Given Date: Wed, 18 Sep 2024 20:05:39 +0200 Subject: media: uvcvideo: Add support for the D3DFMT_R5G6B5 pixmap type This media format is used by the NXP Semiconductors 1fc9:009b chipset, used by the Kaiweets KTI-W02 infrared camera. Signed-off-by: David Given Reviewed-by: Ricardo Ribalda Reviewed-by: Laurent Pinchart Link: https://lore.kernel.org/r/20240918180540.10830-1-dg@cowlark.com Signed-off-by: Laurent Pinchart Signed-off-by: Hans Verkuil --- include/linux/usb/uvc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/uvc.h b/include/linux/usb/uvc.h index 1c16be20c966..bce95153e5a6 100644 --- a/include/linux/usb/uvc.h +++ b/include/linux/usb/uvc.h @@ -143,6 +143,9 @@ #define UVC_GUID_FORMAT_D3DFMT_L8 \ {0x32, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, \ 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} +#define UVC_GUID_FORMAT_D3DFMT_R5G6B5 \ + {0x7b, 0xeb, 0x36, 0xe4, 0x4f, 0x52, 0xce, 0x11, \ + 0x9f, 0x53, 0x00, 0x20, 0xaf, 0x0b, 0xa7, 0x70} #define UVC_GUID_FORMAT_KSMEDIA_L8_IR \ {0x32, 0x00, 0x00, 0x00, 0x02, 0x00, 0x10, 0x00, \ 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -- cgit v1.2.3 From 36e69b160705b65bf136c2fb6a1194447eeb8478 Mon Sep 17 00:00:00 2001 From: Dragan Simic Date: Sun, 29 Sep 2024 11:21:16 +0200 Subject: driver core: Add device probe log helper dev_warn_probe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some drivers can still provide their functionality to a certain extent even when some of their resource acquisitions eventually fail. In such cases, emitting errors isn't the desired action, but warnings should be emitted instead. To solve this, introduce dev_warn_probe() as a new device probe log helper, which behaves identically as the already existing dev_err_probe(), while it produces warnings instead of errors. The intended use is with the resources that are actually optional for a particular driver. While there, copyedit the kerneldoc for dev_err_probe() a bit, to simplify its wording a bit, and reuse it as the kerneldoc for dev_warn_probe(), with the necessary wording adjustments, of course. Signed-off-by: Dragan Simic Tested-by: Hélène Vulquin Acked-by: Greg Kroah-Hartman Link: https://patch.msgid.link/2be0a28538bb2a3d1bcc91e2ca1f2d0dc09146d9.1727601608.git.dsimic@manjaro.org Signed-off-by: Mark Brown --- include/linux/dev_printk.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dev_printk.h b/include/linux/dev_printk.h index ca32b5bb28eb..eb2094e43050 100644 --- a/include/linux/dev_printk.h +++ b/include/linux/dev_printk.h @@ -276,6 +276,7 @@ do { \ dev_driver_string(dev), dev_name(dev), ## arg) __printf(3, 4) int dev_err_probe(const struct device *dev, int err, const char *fmt, ...); +__printf(3, 4) int dev_warn_probe(const struct device *dev, int err, const char *fmt, ...); /* Simple helper for dev_err_probe() when ERR_PTR() is to be returned. */ #define dev_err_ptr_probe(dev, ___err, fmt, ...) \ -- cgit v1.2.3 From 0b028ff7e70ecbe5240ad92e36a664af5cf7f382 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 6 Oct 2024 23:55:11 +0100 Subject: auxdisplay: Remove unused functions cfag12864b_getrate() and cfag12864b_isenabled() were both added in commit 70e840499aae ("[PATCH] drivers: add LCD support") but never used. Remove them. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Geert Uytterhoeven Acked-by: Miguel Ojeda Signed-off-by: Andy Shevchenko --- include/linux/cfag12864b.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cfag12864b.h b/include/linux/cfag12864b.h index 6617d9c68d86..83e6613d12ae 100644 --- a/include/linux/cfag12864b.h +++ b/include/linux/cfag12864b.h @@ -27,13 +27,6 @@ */ extern unsigned char * cfag12864b_buffer; -/* - * Get the refresh rate of the LCD - * - * Returns the refresh rate (hertz). - */ -extern unsigned int cfag12864b_getrate(void); - /* * Enable refreshing * @@ -49,16 +42,6 @@ extern unsigned char cfag12864b_enable(void); */ extern void cfag12864b_disable(void); -/* - * Is enabled refreshing? (is anyone using the module?) - * - * Returns 0 if refreshing is not enabled (anyone is using it), - * or != 0 if refreshing is enabled (someone is using it). - * - * Useful for buffer read-only modules. - */ -extern unsigned char cfag12864b_isenabled(void); - /* * Is the module inited? */ -- cgit v1.2.3 From 581434654e01ec79dd02c21448ac84e2ce2d1a64 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 8 Oct 2024 11:24:58 +0000 Subject: workqueue: Adjust WQ_MAX_ACTIVE from 512 to 2048 WQ_MAX_ACTIVE is currently set to 512, which was established approximately 15 yeas ago. However, with the significant increase in machine sizes and capabilities, the previous limit of 256 concurrent tasks is no longer sufficient. Therefore, we propose to increase WQ_MAX_ACTIVE to 2048. and WQ_DFL_ACTIVE is 1024 now. Signed-off-by: Chen Ridong Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 59c2695e12e7..b0dc957c3e56 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -412,7 +412,7 @@ enum wq_flags { }; enum wq_consts { - WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ + WQ_MAX_ACTIVE = 2048, /* I like 2048, better ideas? */ WQ_UNBOUND_MAX_ACTIVE = WQ_MAX_ACTIVE, WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2, -- cgit v1.2.3 From aefa398d93d5db7c555be78a605ff015357f127d Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Wed, 2 Oct 2024 11:47:16 -0700 Subject: cgroup/rstat: Tracking cgroup-level niced CPU time Cgroup-level CPU statistics currently include time spent on user/system processes, but do not include niced CPU time (despite already being tracked). This patch exposes niced CPU time to the userspace, allowing users to get a better understanding of their hardware limits and can facilitate more informed workload distribution. A new field 'ntime' is added to struct cgroup_base_stat as opposed to struct task_cputime to minimize footprint. Signed-off-by: Joshua Hahn Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 47ae4c4d924c..0a80ef9191a6 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -327,6 +327,7 @@ struct cgroup_base_stat { #ifdef CONFIG_SCHED_CORE u64 forceidle_sum; #endif + u64 ntime; }; /* -- cgit v1.2.3 From 49e4154f4b16345da5e219b23ed9737a6e735bc1 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Wed, 11 Sep 2024 09:00:26 +0800 Subject: tracing: Remove TRACE_EVENT_FL_FILTERED logic After commit dcb0b5575d24 ("tracing: Remove TRACE_EVENT_FL_USE_CALL_FILTER logic"), no one's going to set the TRACE_EVENT_FL_FILTERED or change the call->filter, so remove related logic. Link: https://lore.kernel.org/20240911010026.2302849-1-zhengyejian@huaweicloud.com Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 42bedcddd511..f8f2e52653df 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -326,7 +326,6 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, void trace_event_buffer_commit(struct trace_event_buffer *fbuffer); enum { - TRACE_EVENT_FL_FILTERED_BIT, TRACE_EVENT_FL_CAP_ANY_BIT, TRACE_EVENT_FL_NO_SET_FILTER_BIT, TRACE_EVENT_FL_IGNORE_ENABLE_BIT, @@ -341,7 +340,6 @@ enum { /* * Event flags: - * FILTERED - The event has a filter attached * CAP_ANY - Any user can enable for perf * NO_SET_FILTER - Set when filter has error and is to be ignored * IGNORE_ENABLE - For trace internal events, do not enable with debugfs file @@ -356,7 +354,6 @@ enum { * to a tracepoint yet, then it is cleared when it is. */ enum { - TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT), TRACE_EVENT_FL_IGNORE_ENABLE = (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT), @@ -381,7 +378,6 @@ struct trace_event_call { }; struct trace_event event; char *print_fmt; - struct event_filter *filter; /* * Static events can disappear with modules, * where as dynamic ones need their own ref count. -- cgit v1.2.3 From 836265d31631e28000fc8917ce697fc687a58724 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 21:35:25 +0200 Subject: wifi: remove iw_public_data from struct net_device Given the previous patches, we no longer need the struct iw_public_data etc., it's only used by the old Intel drivers (and ps3_gelic creates it but then doesn't use it). Remove all of that, including the pointer in struct net_device. Link: https://patch.msgid.link/20241007213525.8b2d52b60531.I6a27aaf30bded9a0977f07f47fba2bd31a3b3330@changeid Signed-off-by: Johannes Berg --- include/linux/netdevice.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e87b5e488325..f0abed1e6e45 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1773,7 +1773,6 @@ enum netdev_reg_state { * @wireless_handlers: List of functions to handle Wireless Extensions, * instead of ioctl, * see for details. - * @wireless_data: Instance data managed by the core of wireless extensions * * @netdev_ops: Includes several pointers to callbacks, * if one wants to override the ndo_*() functions @@ -2150,7 +2149,6 @@ struct net_device { #ifdef CONFIG_WIRELESS_EXT const struct iw_handler_def *wireless_handlers; - struct iw_public_data *wireless_data; #endif const struct ethtool_ops *ethtool_ops; #ifdef CONFIG_NET_L3_MASTER_DEV -- cgit v1.2.3 From 9e1a98aac11b757e2634304bbe6bed5f5a58e0ee Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 21:50:25 +0200 Subject: wifi: wext: merge adjacent CONFIG_COMPAT ifdef blocks Simplify this, and also add a comment at the #endif. Link: https://patch.msgid.link/20241007215025.5ecdad1e02ed.I54efa895efc496e06ba41e1c39c9df9e23b0171f@changeid Signed-off-by: Johannes Berg --- include/linux/wireless.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wireless.h b/include/linux/wireless.h index e6e34d74dda0..03e5d3fe226d 100644 --- a/include/linux/wireless.h +++ b/include/linux/wireless.h @@ -21,8 +21,7 @@ struct compat_iw_point { __u16 length; __u16 flags; }; -#endif -#ifdef CONFIG_COMPAT + struct __compat_iw_event { __u16 len; /* Real length of this stuff */ __u16 cmd; /* Wireless IOCTL */ @@ -49,5 +48,5 @@ struct __compat_iw_event { #define IW_EV_COMPAT_POINT_LEN \ (IW_EV_COMPAT_LCP_LEN + sizeof(struct compat_iw_point) - \ IW_EV_COMPAT_POINT_OFF) -#endif +#endif /* CONFIG_COMPAT */ #endif /* _LINUX_WIRELESS_H */ -- cgit v1.2.3 From 4a8840af5f53f2902eba91130fae650879f18e7a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 8 Oct 2024 12:17:19 -0700 Subject: tracepoints: Use new static branch API The old static key API is deprecated. Switch to the new one. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Alice Ryhl Link: https://lore.kernel.org/7a08dae3c5eddb14b13864923c1b58ac1f4af83c.1728414936.git.jpoimboe@kernel.org Signed-off-by: Josh Poimboeuf Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint-defs.h | 4 ++-- include/linux/tracepoint.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h index 4dc4955f0fbf..60a6e8314d4c 100644 --- a/include/linux/tracepoint-defs.h +++ b/include/linux/tracepoint-defs.h @@ -31,7 +31,7 @@ struct tracepoint_func { struct tracepoint { const char *name; /* Tracepoint name */ - struct static_key key; + struct static_key_false key; struct static_call_key *static_call_key; void *static_call_tramp; void *iterator; @@ -83,7 +83,7 @@ struct bpf_raw_event_map { #ifdef CONFIG_TRACEPOINTS # define tracepoint_enabled(tp) \ - static_key_false(&(__tracepoint_##tp).key) + static_branch_unlikely(&(__tracepoint_##tp).key) #else # define tracepoint_enabled(tracepoint) false #endif diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 93a9f3070b48..2a29334bbc02 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -248,7 +248,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DECLARE_TRACE_RCU(name, proto, args, cond) \ static inline void trace_##name##_rcuidle(proto) \ { \ - if (static_key_false(&__tracepoint_##name.key)) \ + if (static_branch_unlikely(&__tracepoint_##name.key)) \ __DO_TRACE(name, \ TP_ARGS(args), \ TP_CONDITION(cond), 1); \ @@ -274,7 +274,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) extern struct tracepoint __tracepoint_##name; \ static inline void trace_##name(proto) \ { \ - if (static_key_false(&__tracepoint_##name.key)) \ + if (static_branch_unlikely(&__tracepoint_##name.key)) \ __DO_TRACE(name, \ TP_ARGS(args), \ TP_CONDITION(cond), 0); \ @@ -311,7 +311,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) static inline bool \ trace_##name##_enabled(void) \ { \ - return static_key_false(&__tracepoint_##name.key); \ + return static_branch_unlikely(&__tracepoint_##name.key);\ } /* @@ -328,7 +328,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) struct tracepoint __tracepoint_##_name __used \ __section("__tracepoints") = { \ .name = __tpstrtab_##_name, \ - .key = STATIC_KEY_INIT_FALSE, \ + .key = STATIC_KEY_FALSE_INIT, \ .static_call_key = &STATIC_CALL_KEY(tp_func_##_name), \ .static_call_tramp = STATIC_CALL_TRAMP_ADDR(tp_func_##_name), \ .iterator = &__traceiter_##_name, \ -- cgit v1.2.3 From 48bcda6848232667f13b4e97588de488c83c37d4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Oct 2024 18:16:29 -0400 Subject: tracing: Remove definition of trace_*_rcuidle() The trace_*_rcuidle() variant of a tracepoint was to handle places where a tracepoint was located but RCU was not "watching". All those locations have been removed, and RCU should be watching where all tracepoints are located. We can now remove the trace_*_rcuidle() variant. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Mark Rutland Cc: Joel Fernandes Link: https://lore.kernel.org/20241003181629.36209057@gandalf.local.home Acked-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 50 ++-------------------------------------------- 1 file changed, 2 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 2a29334bbc02..7e4af7b3633c 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -196,67 +196,25 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DO_TRACE_CALL(name, args) __traceiter_##name(NULL, args) #endif /* CONFIG_HAVE_STATIC_CALL */ -/* - * ARCH_WANTS_NO_INSTR archs are expected to have sanitized entry and idle - * code that disallow any/all tracing/instrumentation when RCU isn't watching. - */ -#ifdef CONFIG_ARCH_WANTS_NO_INSTR -#define RCUIDLE_COND(rcuidle) (rcuidle) -#else -/* srcu can't be used from NMI */ -#define RCUIDLE_COND(rcuidle) (rcuidle && in_nmi()) -#endif - /* * it_func[0] is never NULL because there is at least one element in the array * when the array itself is non NULL. */ -#define __DO_TRACE(name, args, cond, rcuidle) \ +#define __DO_TRACE(name, args, cond) \ do { \ int __maybe_unused __idx = 0; \ \ if (!(cond)) \ return; \ \ - if (WARN_ONCE(RCUIDLE_COND(rcuidle), \ - "Bad RCU usage for tracepoint")) \ - return; \ - \ /* keep srcu and sched-rcu usage consistent */ \ preempt_disable_notrace(); \ \ - /* \ - * For rcuidle callers, use srcu since sched-rcu \ - * doesn't work from the idle path. \ - */ \ - if (rcuidle) { \ - __idx = srcu_read_lock_notrace(&tracepoint_srcu);\ - ct_irq_enter_irqson(); \ - } \ - \ __DO_TRACE_CALL(name, TP_ARGS(args)); \ \ - if (rcuidle) { \ - ct_irq_exit_irqson(); \ - srcu_read_unlock_notrace(&tracepoint_srcu, __idx);\ - } \ - \ preempt_enable_notrace(); \ } while (0) -#ifndef MODULE -#define __DECLARE_TRACE_RCU(name, proto, args, cond) \ - static inline void trace_##name##_rcuidle(proto) \ - { \ - if (static_branch_unlikely(&__tracepoint_##name.key)) \ - __DO_TRACE(name, \ - TP_ARGS(args), \ - TP_CONDITION(cond), 1); \ - } -#else -#define __DECLARE_TRACE_RCU(name, proto, args, cond) -#endif - /* * Make sure the alignment of the structure in the __tracepoints section will * not add unwanted padding between the beginning of the section and the @@ -277,14 +235,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) if (static_branch_unlikely(&__tracepoint_##name.key)) \ __DO_TRACE(name, \ TP_ARGS(args), \ - TP_CONDITION(cond), 0); \ + TP_CONDITION(cond)); \ if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ WARN_ONCE(!rcu_is_watching(), \ "RCU not watching for tracepoint"); \ } \ } \ - __DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args), \ - PARAMS(cond)) \ static inline int \ register_trace_##name(void (*probe)(data_proto), void *data) \ { \ @@ -375,8 +331,6 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DECLARE_TRACE(name, proto, args, cond, data_proto) \ static inline void trace_##name(proto) \ { } \ - static inline void trace_##name##_rcuidle(proto) \ - { } \ static inline int \ register_trace_##name(void (*probe)(data_proto), \ void *data) \ -- cgit v1.2.3 From e53244e2c8931f9e80c1841293aea86ef8ad32a3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Oct 2024 18:42:20 -0400 Subject: tracepoint: Remove SRCU protection With the removal of the trace_*_rcuidle() tracepoints, there is no reason to protect tracepoints with SRCU. The reason the SRCU protection was added, was because it can protect tracepoints when RCU is not "watching". Now that tracepoints are only used when RCU is watching, remove the SRCU protection. It just made things more complex and confusing anyway. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Joel Fernandes Link: https://lore.kernel.org/20241003184220.0dc21d35@gandalf.local.home Acked-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 7e4af7b3633c..3d33b9872cec 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -32,8 +32,6 @@ struct trace_eval_map { #define TRACEPOINT_DEFAULT_PRIO 10 -extern struct srcu_struct tracepoint_srcu; - extern int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data); extern int @@ -109,7 +107,6 @@ void for_each_tracepoint_in_module(struct module *mod, #ifdef CONFIG_TRACEPOINTS static inline void tracepoint_synchronize_unregister(void) { - synchronize_srcu(&tracepoint_srcu); synchronize_rcu(); } #else @@ -207,7 +204,6 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) if (!(cond)) \ return; \ \ - /* keep srcu and sched-rcu usage consistent */ \ preempt_disable_notrace(); \ \ __DO_TRACE_CALL(name, TP_ARGS(args)); \ -- cgit v1.2.3 From f7a870d0be12e3ae38cbe899d858994c5b51f22b Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 9 Oct 2024 17:15:35 +0900 Subject: ata: libata: Remove unused macro definitions ATA_TMOUT_BOOT and ATA_TMOUT_BOOT_QUICK are not used anywhere. Delete these definitions. Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20241009081535.376994-1-dlemoal@kernel.org Signed-off-by: Niklas Cassel --- include/linux/libata.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 9b4a6ff03235..c1a85d46eba6 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -270,9 +270,7 @@ enum { /* bits 24:31 of host->flags are reserved for LLD specific flags */ - /* various lengths of time */ - ATA_TMOUT_BOOT = 30000, /* heuristic */ - ATA_TMOUT_BOOT_QUICK = 7000, /* heuristic */ + /* Various lengths of time */ ATA_TMOUT_INTERNAL_QUICK = 5000, ATA_TMOUT_MAX_PARK = 30000, -- cgit v1.2.3 From d12586e1072d92070783c854819a0ca8c82c5439 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Sat, 5 Oct 2024 23:38:25 +0200 Subject: platform/x86: wmi: Implement proper shutdown handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When performing a system shutdown under Windows, all WMI clients are terminated. This means that the ACPI BIOS might expect all WMI devices to be disabled when shutting down. Emulate this behaviour by disabling all active WMI devices during shutdown. Also introduce a new WMI driver callback to allow WMI drivers to perform any device-specific actions before disabling the WMI device. Tested on a Dell Inspiron 3505. Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20241005213825.701887-2-W_Armin@gmx.de Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/wmi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wmi.h b/include/linux/wmi.h index 3275470b5531..120019677fc6 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -56,6 +56,7 @@ u8 wmidev_instance_count(struct wmi_device *wdev); * @no_singleton: Driver can be instantiated multiple times * @probe: Callback for device binding * @remove: Callback for device unbinding + * @shutdown: Callback for device shutdown * @notify: Callback for receiving WMI events * * This represents WMI drivers which handle WMI devices. @@ -68,6 +69,7 @@ struct wmi_driver { int (*probe)(struct wmi_device *wdev, const void *context); void (*remove)(struct wmi_device *wdev); + void (*shutdown)(struct wmi_device *wdev); void (*notify)(struct wmi_device *device, union acpi_object *data); }; -- cgit v1.2.3 From f042365a26b07006064c2cfa2293e16cad243b0d Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 4 Oct 2024 11:20:56 +0100 Subject: net: pcs: xpcs: provide a helper to get the phylink pcs given xpcs Provide a helper to provide the pointer to the phylink_pcs struct given a valid xpcs pointer. This will be necessary when we make struct dw_xpcs private to pcs-xpcs.c Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/linux/pcs/pcs-xpcs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h index abda475111d1..868515f3cc88 100644 --- a/include/linux/pcs/pcs-xpcs.h +++ b/include/linux/pcs/pcs-xpcs.h @@ -64,6 +64,7 @@ struct dw_xpcs { bool need_reset; }; +struct phylink_pcs *xpcs_to_phylink_pcs(struct dw_xpcs *xpcs); int xpcs_get_an_mode(struct dw_xpcs *xpcs, phy_interface_t interface); void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces); int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns, -- cgit v1.2.3 From accd5f5cd2e1b0ed6805a7c24765648d213561e5 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 4 Oct 2024 11:21:01 +0100 Subject: net: pcs: xpcs: move definition of struct dw_xpcs to private header There should be no reason for anything outside the XPCS code to know the contents of struct dw_xpcs - this is a private structure to XPCS. Move the definition to the private pcs-xpcs.h header, leaving a declaration in the global pcs/pcs-xpcs.h Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/linux/pcs/pcs-xpcs.h | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h index 868515f3cc88..b5b5d17998b8 100644 --- a/include/linux/pcs/pcs-xpcs.h +++ b/include/linux/pcs/pcs-xpcs.h @@ -21,8 +21,6 @@ #define DW_AN_C37_1000BASEX 4 #define DW_10GBASER 5 -struct dw_xpcs_desc; - enum dw_xpcs_pcs_id { DW_XPCS_ID_NATIVE = 0, NXP_SJA1105_XPCS_ID = 0x00000010, @@ -48,21 +46,7 @@ struct dw_xpcs_info { u32 pma; }; -enum dw_xpcs_clock { - DW_XPCS_CORE_CLK, - DW_XPCS_PAD_CLK, - DW_XPCS_NUM_CLKS, -}; - -struct dw_xpcs { - struct dw_xpcs_info info; - const struct dw_xpcs_desc *desc; - struct mdio_device *mdiodev; - struct clk_bulk_data clks[DW_XPCS_NUM_CLKS]; - struct phylink_pcs pcs; - phy_interface_t interface; - bool need_reset; -}; +struct dw_xpcs; struct phylink_pcs *xpcs_to_phylink_pcs(struct dw_xpcs *xpcs); int xpcs_get_an_mode(struct dw_xpcs *xpcs, phy_interface_t interface); -- cgit v1.2.3 From 57e3707eb5e3d9a45eef9151f0378313b1d39a17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 5 Aug 2024 11:39:36 +0200 Subject: bpf: Constify ctl_table argument of filter function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sysctl core is moving to allow "struct ctl_table" in read-only memory. As a preparation for that all functions handling "struct ctl_table" need to be able to work with "const struct ctl_table". As __cgroup_bpf_run_filter_sysctl() does not modify its table, it can be adapted trivially. Signed-off-by: Thomas Weißschuh Signed-off-by: Joel Granados --- include/linux/bpf-cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index ce91d9b2acb9..4dd17128b204 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -138,7 +138,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, short access, enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, - struct ctl_table *table, int write, + const struct ctl_table *table, int write, char **buf, size_t *pcount, loff_t *ppos, enum cgroup_bpf_attach_type atype); -- cgit v1.2.3 From 29e1095bb1ad149b5c417719338d9c81d58bf12b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 5 Aug 2024 11:39:37 +0200 Subject: sysctl: move internal interfaces to const struct ctl_table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As a preparation to make all the core sysctl code work with const struct ctl_table switch over the internal function to use the const variant. Some pointers to "struct ctl_table" need to stay non-const as they are newly allocated and modified before registration. Signed-off-by: Thomas Weißschuh Signed-off-by: Joel Granados --- include/linux/sysctl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index aa4c6d44aaa0..a473deaf5a91 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -162,7 +162,7 @@ struct ctl_node { struct ctl_table_header { union { struct { - struct ctl_table *ctl_table; + const struct ctl_table *ctl_table; int ctl_table_size; int used; int count; -- cgit v1.2.3 From 7abc9b53bd515d7953d1f4e069b062ec4b5ba9e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 5 Aug 2024 11:39:38 +0200 Subject: sysctl: allow registration of const struct ctl_table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Putting structure, especially those containing function pointers, into read-only memory makes the safer and easier to reason about. Change the sysctl registration APIs to allow registration of "const struct ctl_table". Signed-off-by: Thomas Weißschuh Acked-by: Kees Cook Reviewed-by: Kees Cook # security/* Signed-off-by: Joel Granados --- include/linux/sysctl.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index a473deaf5a91..202855befa8b 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -223,13 +223,13 @@ extern void retire_sysctl_set(struct ctl_table_set *set); struct ctl_table_header *__register_sysctl_table( struct ctl_table_set *set, - const char *path, struct ctl_table *table, size_t table_size); -struct ctl_table_header *register_sysctl_sz(const char *path, struct ctl_table *table, + const char *path, const struct ctl_table *table, size_t table_size); +struct ctl_table_header *register_sysctl_sz(const char *path, const struct ctl_table *table, size_t table_size); void unregister_sysctl_table(struct ctl_table_header * table); extern int sysctl_init_bases(void); -extern void __register_sysctl_init(const char *path, struct ctl_table *table, +extern void __register_sysctl_init(const char *path, const struct ctl_table *table, const char *table_name, size_t table_size); #define register_sysctl_init(path, table) \ __register_sysctl_init(path, table, #table, ARRAY_SIZE(table)) @@ -251,7 +251,7 @@ extern int no_unaligned_warning; #else /* CONFIG_SYSCTL */ -static inline void register_sysctl_init(const char *path, struct ctl_table *table) +static inline void register_sysctl_init(const char *path, const struct ctl_table *table) { } @@ -261,7 +261,7 @@ static inline struct ctl_table_header *register_sysctl_mount_point(const char *p } static inline struct ctl_table_header *register_sysctl_sz(const char *path, - struct ctl_table *table, + const struct ctl_table *table, size_t table_size) { return NULL; -- cgit v1.2.3 From 823a566221a5639f6c69424897218e5d6431a970 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Wed, 9 Oct 2024 11:20:31 +0200 Subject: locking/ww_mutex: Adjust to lockdep nest_lock requirements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When using mutex_acquire_nest() with a nest_lock, lockdep refcounts the number of acquired lockdep_maps of mutexes of the same class, and also keeps a pointer to the first acquired lockdep_map of a class. That pointer is then used for various comparison-, printing- and checking purposes, but there is no mechanism to actively ensure that lockdep_map stays in memory. Instead, a warning is printed if the lockdep_map is freed and there are still held locks of the same lock class, even if the lockdep_map itself has been released. In the context of WW/WD transactions that means that if a user unlocks and frees a ww_mutex from within an ongoing ww transaction, and that mutex happens to be the first ww_mutex grabbed in the transaction, such a warning is printed and there might be a risk of a UAF. Note that this is only problem when lockdep is enabled and affects only dereferences of struct lockdep_map. Adjust to this by adding a fake lockdep_map to the acquired context and make sure it is the first acquired lockdep map of the associated ww_mutex class. Then hold it for the duration of the WW/WD transaction. This has the side effect that trying to lock a ww mutex *without* a ww_acquire_context but where a such context has been acquire, we'd see a lockdep splat. The test-ww_mutex.c selftest attempts to do that, so modify that particular test to not acquire a ww_acquire_context if it is not going to be used. Signed-off-by: Thomas Hellström Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20241009092031.6356-1-thomas.hellstrom@linux.intel.com --- include/linux/ww_mutex.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h index bb763085479a..a401a2f31a77 100644 --- a/include/linux/ww_mutex.h +++ b/include/linux/ww_mutex.h @@ -65,6 +65,16 @@ struct ww_acquire_ctx { #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; + /** + * @first_lock_dep_map: fake lockdep_map for first locked ww_mutex. + * + * lockdep requires the lockdep_map for the first locked ww_mutex + * in a ww transaction to remain in memory until all ww_mutexes of + * the transaction have been unlocked. Ensure this by keeping a + * fake locked ww_mutex lockdep map between ww_acquire_init() and + * ww_acquire_fini(). + */ + struct lockdep_map first_lock_dep_map; #endif #ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH unsigned int deadlock_inject_interval; @@ -146,7 +156,10 @@ static inline void ww_acquire_init(struct ww_acquire_ctx *ctx, debug_check_no_locks_freed((void *)ctx, sizeof(*ctx)); lockdep_init_map(&ctx->dep_map, ww_class->acquire_name, &ww_class->acquire_key, 0); + lockdep_init_map(&ctx->first_lock_dep_map, ww_class->mutex_name, + &ww_class->mutex_key, 0); mutex_acquire(&ctx->dep_map, 0, 0, _RET_IP_); + mutex_acquire_nest(&ctx->first_lock_dep_map, 0, 0, &ctx->dep_map, _RET_IP_); #endif #ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH ctx->deadlock_inject_interval = 1; @@ -185,6 +198,7 @@ static inline void ww_acquire_done(struct ww_acquire_ctx *ctx) static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx) { #ifdef CONFIG_DEBUG_LOCK_ALLOC + mutex_release(&ctx->first_lock_dep_map, _THIS_IP_); mutex_release(&ctx->dep_map, _THIS_IP_); #endif #ifdef DEBUG_WW_MUTEXES -- cgit v1.2.3 From 5461f3fd74a89757f95f351eb0bc26aafc2a2e91 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 20 Sep 2024 00:27:58 +0100 Subject: backlight: Remove notifier backlight_register_notifier and backlight_unregister_notifier have been unused since commit 6cb634d0dc85 ("ACPI: video: Remove code to unregister acpi_video backlight when a native backlight registers") With those not being called, it means that the backlight_notifier list is always empty. Remove the functions, the list itself and the enum used in the notifications. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Daniel Thompson Reviewed-by: Simona Vetter Link: https://lore.kernel.org/r/20240919232758.639925-1-linux@treblig.org Signed-off-by: Lee Jones --- include/linux/backlight.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backlight.h b/include/linux/backlight.h index ea9c1bc8148e..f5652e5a9060 100644 --- a/include/linux/backlight.h +++ b/include/linux/backlight.h @@ -66,24 +66,6 @@ enum backlight_type { BACKLIGHT_TYPE_MAX, }; -/** - * enum backlight_notification - the type of notification - * - * The notifications that is used for notification sent to the receiver - * that registered notifications using backlight_register_notifier(). - */ -enum backlight_notification { - /** - * @BACKLIGHT_REGISTERED: The backlight device is registered. - */ - BACKLIGHT_REGISTERED, - - /** - * @BACKLIGHT_UNREGISTERED: The backlight revice is unregistered. - */ - BACKLIGHT_UNREGISTERED, -}; - /** enum backlight_scale - the type of scale used for brightness values * * The type of scale used for brightness values. @@ -421,8 +403,6 @@ void devm_backlight_device_unregister(struct device *dev, struct backlight_device *bd); void backlight_force_update(struct backlight_device *bd, enum backlight_update_reason reason); -int backlight_register_notifier(struct notifier_block *nb); -int backlight_unregister_notifier(struct notifier_block *nb); struct backlight_device *backlight_device_get_by_name(const char *name); struct backlight_device *backlight_device_get_by_type(enum backlight_type type); int backlight_device_set_brightness(struct backlight_device *bd, -- cgit v1.2.3 From 6f1067cfbee72b04fc42234f7f1588f838cec0b6 Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Tue, 1 Oct 2024 13:53:27 +0200 Subject: mfd: Add Congatec Board Controller driver Add core MFD driver for the Board Controller found on some Congatec SMARC module. This Board Controller provides functions like watchdog, GPIO, and I2C busses. This commit adds support only for the conga-SA7 module. Signed-off-by: Thomas Richard Link: https://lore.kernel.org/r/20241001-congatec-board-controller-v3-1-39ceceed5c47@bootlin.com Signed-off-by: Lee Jones --- include/linux/mfd/cgbc.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 include/linux/mfd/cgbc.h (limited to 'include/linux') diff --git a/include/linux/mfd/cgbc.h b/include/linux/mfd/cgbc.h new file mode 100644 index 000000000000..badbec4c7033 --- /dev/null +++ b/include/linux/mfd/cgbc.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Congatec Board Controller driver definitions + * + * Copyright (C) 2024 Bootlin + * Author: Thomas Richard + */ + +#ifndef _LINUX_MFD_CGBC_H_ + +/** + * struct cgbc_version - Board Controller device version structure + * @feature: Board Controller feature number + * @major: Board Controller major revision + * @minor: Board Controller minor revision + */ +struct cgbc_version { + unsigned char feature; + unsigned char major; + unsigned char minor; +}; + +/** + * struct cgbc_device_data - Internal representation of the Board Controller device + * @io_session: Pointer to the session IO memory + * @io_cmd: Pointer to the command IO memory + * @session: Session id returned by the Board Controller + * @dev: Pointer to kernel device structure + * @cgbc_version: Board Controller version structure + * @mutex: Board Controller mutex + */ +struct cgbc_device_data { + void __iomem *io_session; + void __iomem *io_cmd; + u8 session; + struct device *dev; + struct cgbc_version version; + struct mutex lock; +}; + +int cgbc_command(struct cgbc_device_data *cgbc, void *cmd, unsigned int cmd_size, + void *data, unsigned int data_size, u8 *status); + +#endif /*_LINUX_MFD_CGBC_H_*/ -- cgit v1.2.3 From 0e6caab8db8bc9359d1a8363e1ee9b2205ed704d Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 8 Oct 2024 21:07:11 -0400 Subject: tracing: Declare system call tracepoints with TRACE_EVENT_SYSCALL In preparation for allowing system call tracepoints to handle page faults, introduce TRACE_EVENT_SYSCALL to declare the sys_enter/sys_exit tracepoints. Move the common code between __DECLARE_TRACE and __DECLARE_TRACE_SYSCALL into __DECLARE_TRACE_COMMON. This change is not meant to alter the generated code, and only prepares the following modifications. Cc: Michael Jeanson Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Link: https://lore.kernel.org/20241009010718.2050182-2-mathieu.desnoyers@efficios.com Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 53 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 3d33b9872cec..76e441b39a96 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -197,7 +197,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) * it_func[0] is never NULL because there is at least one element in the array * when the array itself is non NULL. */ -#define __DO_TRACE(name, args, cond) \ +#define __DO_TRACE(name, args, cond, syscall) \ do { \ int __maybe_unused __idx = 0; \ \ @@ -222,21 +222,10 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) * site if it is not watching, as it will need to be active when the * tracepoint is enabled. */ -#define __DECLARE_TRACE(name, proto, args, cond, data_proto) \ +#define __DECLARE_TRACE_COMMON(name, proto, args, cond, data_proto) \ extern int __traceiter_##name(data_proto); \ DECLARE_STATIC_CALL(tp_func_##name, __traceiter_##name); \ extern struct tracepoint __tracepoint_##name; \ - static inline void trace_##name(proto) \ - { \ - if (static_branch_unlikely(&__tracepoint_##name.key)) \ - __DO_TRACE(name, \ - TP_ARGS(args), \ - TP_CONDITION(cond)); \ - if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ - WARN_ONCE(!rcu_is_watching(), \ - "RCU not watching for tracepoint"); \ - } \ - } \ static inline int \ register_trace_##name(void (*probe)(data_proto), void *data) \ { \ @@ -266,6 +255,34 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) return static_branch_unlikely(&__tracepoint_##name.key);\ } +#define __DECLARE_TRACE(name, proto, args, cond, data_proto) \ + __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), cond, PARAMS(data_proto)) \ + static inline void trace_##name(proto) \ + { \ + if (static_branch_unlikely(&__tracepoint_##name.key)) \ + __DO_TRACE(name, \ + TP_ARGS(args), \ + TP_CONDITION(cond), 0); \ + if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ + WARN_ONCE(!rcu_is_watching(), \ + "RCU not watching for tracepoint"); \ + } \ + } + +#define __DECLARE_TRACE_SYSCALL(name, proto, args, cond, data_proto) \ + __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), cond, PARAMS(data_proto)) \ + static inline void trace_##name(proto) \ + { \ + if (static_branch_unlikely(&__tracepoint_##name.key)) \ + __DO_TRACE(name, \ + TP_ARGS(args), \ + TP_CONDITION(cond), 1); \ + if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ + WARN_ONCE(!rcu_is_watching(), \ + "RCU not watching for tracepoint"); \ + } \ + } + /* * We have no guarantee that gcc and the linker won't up-align the tracepoint * structures, so we create an array of pointers that will be used for iteration @@ -348,6 +365,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) return false; \ } +#define __DECLARE_TRACE_SYSCALL __DECLARE_TRACE + #define DEFINE_TRACE_FN(name, reg, unreg, proto, args) #define DEFINE_TRACE(name, proto, args) #define EXPORT_TRACEPOINT_SYMBOL_GPL(name) @@ -409,6 +428,11 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) cpu_online(raw_smp_processor_id()) && (PARAMS(cond)), \ PARAMS(void *__data, proto)) +#define DECLARE_TRACE_SYSCALL(name, proto, args) \ + __DECLARE_TRACE_SYSCALL(name, PARAMS(proto), PARAMS(args), \ + cpu_online(raw_smp_processor_id()), \ + PARAMS(void *__data, proto)) + #define TRACE_EVENT_FLAGS(event, flag) #define TRACE_EVENT_PERF_PERM(event, expr...) @@ -546,6 +570,9 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) struct, assign, print) \ DECLARE_TRACE_CONDITION(name, PARAMS(proto), \ PARAMS(args), PARAMS(cond)) +#define TRACE_EVENT_SYSCALL(name, proto, args, struct, assign, \ + print, reg, unreg) \ + DECLARE_TRACE_SYSCALL(name, PARAMS(proto), PARAMS(args)) #define TRACE_EVENT_FLAGS(event, flag) -- cgit v1.2.3 From a363d27cdbc2bc2d1899b5a1520b64e3590fcd9a Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 8 Oct 2024 21:07:15 -0400 Subject: tracing: Allow system call tracepoints to handle page faults Use Tasks Trace RCU to protect iteration of system call enter/exit tracepoint probes to allow those probes to handle page faults. In preparation for this change, all tracers registering to system call enter/exit tracepoints should expect those to be called with preemption enabled. This allows tracers to fault-in userspace system call arguments such as path strings within their probe callbacks. Cc: Michael Jeanson Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Link: https://lore.kernel.org/20241009010718.2050182-6-mathieu.desnoyers@efficios.com Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 76e441b39a96..0dc67fad706c 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -107,6 +108,7 @@ void for_each_tracepoint_in_module(struct module *mod, #ifdef CONFIG_TRACEPOINTS static inline void tracepoint_synchronize_unregister(void) { + synchronize_rcu_tasks_trace(); synchronize_rcu(); } #else @@ -196,6 +198,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) /* * it_func[0] is never NULL because there is at least one element in the array * when the array itself is non NULL. + * + * With @syscall=0, the tracepoint callback array dereference is + * protected by disabling preemption. + * With @syscall=1, the tracepoint callback array dereference is + * protected by Tasks Trace RCU, which allows probes to handle page + * faults. */ #define __DO_TRACE(name, args, cond, syscall) \ do { \ @@ -204,11 +212,17 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) if (!(cond)) \ return; \ \ - preempt_disable_notrace(); \ + if (syscall) \ + rcu_read_lock_trace(); \ + else \ + preempt_disable_notrace(); \ \ __DO_TRACE_CALL(name, TP_ARGS(args)); \ \ - preempt_enable_notrace(); \ + if (syscall) \ + rcu_read_unlock_trace(); \ + else \ + preempt_enable_notrace(); \ } while (0) /* -- cgit v1.2.3 From 21291491e3f39d1dac6453e376f7619b21239b5e Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 9 Oct 2024 01:35:52 +0100 Subject: clk: Remove unused clk_hw_rate_is_protected clk_hw_rate_is_protected() was added in 2017's commit e55a839a7a1c ("clk: add clock protection mechanism to clk core") but has been unused. Remove it. Signed-off-by: Dr. David Alan Gilbert Link: https://lore.kernel.org/r/20241009003552.254675-1-linux@treblig.org Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 7e43caabb54b..5cbd112d1187 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -1360,7 +1360,6 @@ unsigned long clk_hw_get_flags(const struct clk_hw *hw); (clk_hw_get_flags((hw)) & CLK_SET_RATE_PARENT) bool clk_hw_is_prepared(const struct clk_hw *hw); -bool clk_hw_rate_is_protected(const struct clk_hw *hw); bool clk_hw_is_enabled(const struct clk_hw *hw); bool __clk_is_enabled(struct clk *clk); struct clk *__clk_lookup(const char *name); -- cgit v1.2.3 From 87173021f1583ee37f4801fcde354729da8db3dc Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 8 Oct 2024 10:29:03 -0700 Subject: ipv4: Link IPv4 address to per-netns hash table. As a prep for per-netns RTNL conversion, we want to namespacify the IPv4 address hash table and the GC work. Let's allocate the per-netns IPv4 address hash table to net->ipv4.inet_addr_lst and link IPv4 addresses into it. The actual users will be converted later. Note that the IPv6 address hash table is already namespacified. Reviewed-by: Eric Dumazet Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241008172906.1326-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/linux/inetdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index cb5280e6cc21..d0c2bf67a9b0 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -142,6 +142,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev) struct in_ifaddr { struct hlist_node hash; + struct hlist_node addr_lst; struct in_ifaddr __rcu *ifa_next; struct in_device *ifa_dev; struct rcu_head rcu_head; -- cgit v1.2.3 From 99ee348e6a41cf24b334a1bb7cde87239e8e2d95 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 8 Oct 2024 10:29:06 -0700 Subject: ipv4: Retire global IPv4 hash table inet_addr_lst. No one uses inet_addr_lst anymore, so let's remove it. Reviewed-by: Eric Dumazet Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241008172906.1326-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/linux/inetdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index d0c2bf67a9b0..d9c690c8c80b 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -141,7 +141,6 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev) ARP_EVICT_NOCARRIER) struct in_ifaddr { - struct hlist_node hash; struct hlist_node addr_lst; struct in_ifaddr __rcu *ifa_next; struct in_device *ifa_dev; -- cgit v1.2.3 From ee3283c608dfa21251b0821d7bb198c7ae3189f6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 2 Oct 2024 17:27:16 -0400 Subject: timekeeping: Add interfaces for handling timestamps with a floor value Multigrain timestamps allow the kernel to use fine-grained timestamps when an inode's attributes is being actively observed via ->getattr(). With this support, it's possible for a file to get a fine-grained timestamp, and another modified after it to get a coarse-grained stamp that is earlier than the fine-grained time. If this happens then the files can appear to have been modified in reverse order, which breaks VFS ordering guarantees [1]. To prevent this, maintain a floor value for multigrain timestamps. Whenever a fine-grained timestamp is handed out, record it, and when later coarse-grained stamps are handed out, ensure they are not earlier than that value. If the coarse-grained timestamp is earlier than the fine-grained floor, return the floor value instead. Add a static singleton atomic64_t into timekeeper.c that is used to keep track of the latest fine-grained time ever handed out. This is tracked as a monotonic ktime_t value to ensure that it isn't affected by clock jumps. Because it is updated at different times than the rest of the timekeeper object, the floor value is managed independently of the timekeeper via a cmpxchg() operation, and sits on its own cacheline. Add two new public interfaces: - ktime_get_coarse_real_ts64_mg() fills a timespec64 with the later of the coarse-grained clock and the floor time - ktime_get_real_ts64_mg() gets the fine-grained clock value, and tries to swap it into the floor. A timespec64 is filled with the result. The floor value is global and updated via a single try_cmpxchg(). If that fails then the operation raced with a concurrent update. Any concurrent update must be later than the existing floor value, so any racing tasks can accept any resulting floor value without retrying. [1]: POSIX requires that files be stamped with realtime clock values, and makes no provision for dealing with backward clock jumps. If a backward realtime clock jump occurs, then files can appear to have been modified in reverse order. Signed-off-by: Jeff Layton Signed-off-by: Thomas Gleixner Tested-by: Randy Dunlap # documentation bits Acked-by: John Stultz Link: https://lore.kernel.org/all/20241002-mgtime-v10-1-d1c4717f5284@kernel.org Signed-off-by: Christian Brauner --- include/linux/timekeeping.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index fc12a9ba2c88..7aa85246c183 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -45,6 +45,10 @@ extern void ktime_get_real_ts64(struct timespec64 *tv); extern void ktime_get_coarse_ts64(struct timespec64 *ts); extern void ktime_get_coarse_real_ts64(struct timespec64 *ts); +/* Multigrain timestamp interfaces */ +extern void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts); +extern void ktime_get_real_ts64_mg(struct timespec64 *ts); + void getboottime64(struct timespec64 *ts); /* -- cgit v1.2.3 From 2a15385742c689a271345dcbb4c28b9c568bc7ce Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 2 Oct 2024 17:27:17 -0400 Subject: timekeeping: Add percpu counter for tracking floor swap events The mgtime_floor value is a global variable for tracking the latest fine-grained timestamp handed out. Because it's a global, track the number of times that a new floor value is assigned. Add a new percpu counter to the timekeeping code to track the number of floor swap events that have occurred. A later patch will add a debugfs file to display this counter alongside other stats involving multigrain timestamps. Signed-off-by: Jeff Layton Signed-off-by: Thomas Gleixner Tested-by: Randy Dunlap # documentation bits Link: https://lore.kernel.org/all/20241002-mgtime-v10-2-d1c4717f5284@kernel.org Signed-off-by: Christian Brauner --- include/linux/timekeeping.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 7aa85246c183..84a035e86ac8 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -48,6 +48,7 @@ extern void ktime_get_coarse_real_ts64(struct timespec64 *ts); /* Multigrain timestamp interfaces */ extern void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts); extern void ktime_get_real_ts64_mg(struct timespec64 *ts); +extern unsigned long timekeeping_get_mg_floor_swaps(void); void getboottime64(struct timespec64 *ts); -- cgit v1.2.3 From 7f2c86cba3c584c7227cddaabdf0ab54c8151e60 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 2 Oct 2024 17:27:20 -0400 Subject: fs: handle delegated timestamps in setattr_copy_mgtime An update to the inode ctime typically requires the latest clock value possible. The exception to this rule is when there is a nfsd write delegation and the server is proxying timestamps from the client. When nfsd gets a CB_GETATTR response, update the timestamp value in the inode to the values that the client is tracking. The client doesn't send a ctime value (since that's always determined by the exported filesystem), but it can send a mtime value. In the case where it does, update the ctime to a value commensurate with that instead of the current time. If ATTR_DELEG is set, then use ia_ctime value instead of setting the timestamp to the current time. With the addition of delegated timestamps, the server may receive a request to update only the atime, which doesn't involve a ctime update. Trust the ATTR_CTIME flag in the update and only update the ctime when it's set. Tested-by: Randy Dunlap # documentation bits Reviewed-by: Jan Kara Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20241002-mgtime-v10-5-d1c4717f5284@kernel.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index eff688e75f2f..ea7ed437d2b1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1544,6 +1544,8 @@ static inline bool fsuidgid_has_mapping(struct super_block *sb, struct timespec64 current_time(struct inode *inode); struct timespec64 inode_set_ctime_current(struct inode *inode); +struct timespec64 inode_set_ctime_deleg(struct inode *inode, + struct timespec64 update); static inline time64_t inode_get_atime_sec(const struct inode *inode) { -- cgit v1.2.3 From 016f426a14f09faa8bdb68b063c2947edf3108a1 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 8 Oct 2024 21:32:09 +0300 Subject: net/mlx5: qos: Flesh out element_attributes in mlx5_ifc.h This is used for multiple purposes, depending on the scheduling element created. There are a few helper struct defined a long time ago, but they are not easy to find in the file and they are about to get new members. This commit cleans up this area a bit by: - moving the helper structs closer to where they are relevant. - defining a helper union to include all of them to help discoverability. - making use of it everywhere element_attributes is used. - using a consistent 'attr' name. Signed-off-by: Cosmin Ratiu Signed-off-by: Tariq Toukan Signed-off-by: Paolo Abeni --- include/linux/mlx5/mlx5_ifc.h | 67 ++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 96d369112bfa..c79ba6197673 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -4105,11 +4105,47 @@ enum { ELEMENT_TYPE_CAP_MASK_QUEUE_GROUP = 1 << 4, }; +enum { + TSAR_ELEMENT_TSAR_TYPE_DWRR = 0x0, + TSAR_ELEMENT_TSAR_TYPE_ROUND_ROBIN = 0x1, + TSAR_ELEMENT_TSAR_TYPE_ETS = 0x2, +}; + +enum { + TSAR_TYPE_CAP_MASK_DWRR = 1 << 0, + TSAR_TYPE_CAP_MASK_ROUND_ROBIN = 1 << 1, + TSAR_TYPE_CAP_MASK_ETS = 1 << 2, +}; + +struct mlx5_ifc_tsar_element_bits { + u8 reserved_at_0[0x8]; + u8 tsar_type[0x8]; + u8 reserved_at_10[0x10]; +}; + +struct mlx5_ifc_vport_element_bits { + u8 reserved_at_0[0x10]; + u8 vport_number[0x10]; +}; + +struct mlx5_ifc_vport_tc_element_bits { + u8 traffic_class[0x4]; + u8 reserved_at_4[0xc]; + u8 vport_number[0x10]; +}; + +union mlx5_ifc_element_attributes_bits { + struct mlx5_ifc_tsar_element_bits tsar; + struct mlx5_ifc_vport_element_bits vport; + struct mlx5_ifc_vport_tc_element_bits vport_tc; + u8 reserved_at_0[0x20]; +}; + struct mlx5_ifc_scheduling_context_bits { u8 element_type[0x8]; u8 reserved_at_8[0x18]; - u8 element_attributes[0x20]; + union mlx5_ifc_element_attributes_bits element_attributes; u8 parent_element_id[0x20]; @@ -4798,35 +4834,6 @@ struct mlx5_ifc_register_loopback_control_bits { u8 reserved_at_20[0x60]; }; -struct mlx5_ifc_vport_tc_element_bits { - u8 traffic_class[0x4]; - u8 reserved_at_4[0xc]; - u8 vport_number[0x10]; -}; - -struct mlx5_ifc_vport_element_bits { - u8 reserved_at_0[0x10]; - u8 vport_number[0x10]; -}; - -enum { - TSAR_ELEMENT_TSAR_TYPE_DWRR = 0x0, - TSAR_ELEMENT_TSAR_TYPE_ROUND_ROBIN = 0x1, - TSAR_ELEMENT_TSAR_TYPE_ETS = 0x2, -}; - -enum { - TSAR_TYPE_CAP_MASK_DWRR = 1 << 0, - TSAR_TYPE_CAP_MASK_ROUND_ROBIN = 1 << 1, - TSAR_TYPE_CAP_MASK_ETS = 1 << 2, -}; - -struct mlx5_ifc_tsar_element_bits { - u8 reserved_at_0[0x8]; - u8 tsar_type[0x8]; - u8 reserved_at_10[0x10]; -}; - enum { MLX5_TEARDOWN_HCA_OUT_FORCE_STATE_SUCCESS = 0x0, MLX5_TEARDOWN_HCA_OUT_FORCE_STATE_FAIL = 0x1, -- cgit v1.2.3 From fceffbfe57af7d9941d08e1a995cccf558d08451 Mon Sep 17 00:00:00 2001 From: Patrick Rudolph Date: Wed, 2 Oct 2024 14:54:58 +0200 Subject: regulator: max5970: Drop unused structs After splitting the max5970 into a MFD device clean the remaining code and drop unused structs. The struct max5970_data and enum max5970_chip_type aren't used. Signed-off-by: Patrick Rudolph Acked-by: Lee Jones Link: https://patch.msgid.link/20241002125500.78278-1-patrick.rudolph@9elements.com Signed-off-by: Mark Brown --- include/linux/mfd/max5970.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/max5970.h b/include/linux/mfd/max5970.h index 762a7d40c843..fc50e89edfaa 100644 --- a/include/linux/mfd/max5970.h +++ b/include/linux/mfd/max5970.h @@ -16,18 +16,6 @@ #define MAX5978_NUM_SWITCHES 1 #define MAX5970_NUM_LEDS 4 -struct max5970_data { - int num_switches; - u32 irng[MAX5970_NUM_SWITCHES]; - u32 mon_rng[MAX5970_NUM_SWITCHES]; - u32 shunt_micro_ohms[MAX5970_NUM_SWITCHES]; -}; - -enum max5970_chip_type { - TYPE_MAX5978 = 1, - TYPE_MAX5970, -}; - #define MAX5970_REG_CURRENT_L(ch) (0x01 + (ch) * 4) #define MAX5970_REG_CURRENT_H(ch) (0x00 + (ch) * 4) #define MAX5970_REG_VOLTAGE_L(ch) (0x03 + (ch) * 4) -- cgit v1.2.3 From 0e8158b4a82eb5bfb69df17510d210b073896f00 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 2 Oct 2024 14:22:24 +0200 Subject: OPP: Rework _set_required_devs() to manage a single device per call At this point there are no consumer drivers that makes use of _set_required_devs(), hence it should be straightforward to rework the code to enable it to better integrate with the PM domain attach procedure. During attach, one device at the time is being hooked up to its corresponding PM domain. Therefore, let's update the _set_required_devs() to align to this behaviour, allowing callers to fill out one required_dev per call. Subsequent changes starts making use of this. Signed-off-by: Ulf Hansson Acked-by: Viresh Kumar Link: https://lore.kernel.org/r/20241002122232.194245-4-ulf.hansson@linaro.org --- include/linux/pm_opp.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 6424692c30b7..bc74bc69107a 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -63,10 +63,11 @@ typedef int (*config_clks_t)(struct device *dev, struct opp_table *opp_table, * @supported_hw_count: Number of elements in the array. * @regulator_names: Array of pointers to the names of the regulator, NULL terminated. * @genpd_names: Null terminated array of pointers containing names of genpd to - * attach. Mutually exclusive with required_devs. + * attach. Mutually exclusive with required_dev. * @virt_devs: Pointer to return the array of genpd virtual devices. Mutually - * exclusive with required_devs. - * @required_devs: Required OPP devices. Mutually exclusive with genpd_names/virt_devs. + * exclusive with required_dev. + * @required_dev: Required OPP device. Mutually exclusive with genpd_names/virt_devs. + * @required_dev_index: The index of the required OPP for the @required_dev. * * This structure contains platform specific OPP configurations for the device. */ @@ -81,7 +82,8 @@ struct dev_pm_opp_config { const char * const *regulator_names; const char * const *genpd_names; struct device ***virt_devs; - struct device **required_devs; + struct device *required_dev; + unsigned int required_dev_index; }; #define OPP_LEVEL_UNSET U32_MAX -- cgit v1.2.3 From 98d277a79126a2df8a2617215846d01f823cfffa Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 2 Oct 2024 14:22:25 +0200 Subject: PM: domains: Support required OPPs in dev_pm_domain_attach_list() In the multiple PM domain case we need platform code to specify the index of the corresponding required OPP in DT for a device, which is what *_opp_attach_genpd() is there to help us with. However, attaching a device to its PM domains is in general better done with dev_pm_domain_attach_list(). To avoid having two different ways to manage this and to prepare for the removal of *_opp_attach_genpd(), let's extend dev_pm_domain_attach|detach_list() to manage the required OPPs too. Signed-off-by: Ulf Hansson Acked-by: Viresh Kumar Link: https://lore.kernel.org/r/20241002122232.194245-5-ulf.hansson@linaro.org --- include/linux/pm_domain.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index b637ec14025f..92f9d56f623d 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -30,9 +30,16 @@ * supplier and its PM domain when creating the * device-links. * + * PD_FLAG_REQUIRED_OPP: Assign required_devs for the required OPPs. The + * index of the required OPP must correspond to the + * index in the array of the pd_names. If pd_names + * isn't specified, the index just follows the + * index for the attached PM domain. + * */ #define PD_FLAG_NO_DEV_LINK BIT(0) #define PD_FLAG_DEV_LINK_ON BIT(1) +#define PD_FLAG_REQUIRED_OPP BIT(2) struct dev_pm_domain_attach_data { const char * const *pd_names; @@ -43,6 +50,7 @@ struct dev_pm_domain_attach_data { struct dev_pm_domain_list { struct device **pd_devs; struct device_link **pd_links; + u32 *opp_tokens; u32 num_pds; }; -- cgit v1.2.3 From e130ca9d4873f8d01e3e7b8137e0c14196a3cfb4 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 2 Oct 2024 14:22:27 +0200 Subject: pmdomain: core: Set the required dev for a required OPP during genpd attach In the single PM domain case there is no need for platform code to specify the index of the corresponding required OPP in DT, as the index must be zero. This allows us to assign a required dev for the required OPP from genpd, while attaching a device to its PM domain. In this way, we can remove some of the genpd specific code in the OPP core for the single PM domain case. Although, this cleanup is made from a subsequent change. Signed-off-by: Ulf Hansson Acked-by: Viresh Kumar Link: https://lore.kernel.org/r/20241002122232.194245-7-ulf.hansson@linaro.org --- include/linux/pm_domain.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 92f9d56f623d..76775ab38898 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -252,6 +252,7 @@ struct generic_pm_domain_data { unsigned int performance_state; unsigned int default_pstate; unsigned int rpm_pstate; + unsigned int opp_token; bool hw_mode; void *data; }; -- cgit v1.2.3 From d6caca30a548764f8cfd78393ea09fefdf285212 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 2 Oct 2024 14:22:32 +0200 Subject: OPP: Drop redundant *_opp_attach|detach_genpd() All users of *_opp_attach|detach_genpd(), have been converted to use dev|devm_pm_domain_attach|detach_list(), hence let's drop it along with its corresponding exported functions. Acked-by: Viresh Kumar Signed-off-by: Ulf Hansson Link: https://lore.kernel.org/r/20241002122232.194245-12-ulf.hansson@linaro.org --- include/linux/pm_opp.h | 38 +------------------------------------- 1 file changed, 1 insertion(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index bc74bc69107a..568183e3e641 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -62,11 +62,7 @@ typedef int (*config_clks_t)(struct device *dev, struct opp_table *opp_table, * @supported_hw: Array of hierarchy of versions to match. * @supported_hw_count: Number of elements in the array. * @regulator_names: Array of pointers to the names of the regulator, NULL terminated. - * @genpd_names: Null terminated array of pointers containing names of genpd to - * attach. Mutually exclusive with required_dev. - * @virt_devs: Pointer to return the array of genpd virtual devices. Mutually - * exclusive with required_dev. - * @required_dev: Required OPP device. Mutually exclusive with genpd_names/virt_devs. + * @required_dev: The required OPP device. * @required_dev_index: The index of the required OPP for the @required_dev. * * This structure contains platform specific OPP configurations for the device. @@ -80,8 +76,6 @@ struct dev_pm_opp_config { const unsigned int *supported_hw; unsigned int supported_hw_count; const char * const *regulator_names; - const char * const *genpd_names; - struct device ***virt_devs; struct device *required_dev; unsigned int required_dev_index; }; @@ -677,36 +671,6 @@ static inline void dev_pm_opp_put_config_regulators(int token) dev_pm_opp_clear_config(token); } -/* genpd helpers */ -static inline int dev_pm_opp_attach_genpd(struct device *dev, - const char * const *names, - struct device ***virt_devs) -{ - struct dev_pm_opp_config config = { - .genpd_names = names, - .virt_devs = virt_devs, - }; - - return dev_pm_opp_set_config(dev, &config); -} - -static inline void dev_pm_opp_detach_genpd(int token) -{ - dev_pm_opp_clear_config(token); -} - -static inline int devm_pm_opp_attach_genpd(struct device *dev, - const char * const *names, - struct device ***virt_devs) -{ - struct dev_pm_opp_config config = { - .genpd_names = names, - .virt_devs = virt_devs, - }; - - return devm_pm_opp_set_config(dev, &config); -} - /* prop-name helpers */ static inline int dev_pm_opp_set_prop_name(struct device *dev, const char *name) { -- cgit v1.2.3 From 13d68a16430312fc21990f48326366eb73891202 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 9 Oct 2024 10:09:47 +0200 Subject: genetlink: extend info user-storage to match NL cb ctx This allows a more uniform implementation of non-dump and dump operations, and will be used later in the series to avoid some per-operation allocation. Additionally rename the NL_ASSERT_DUMP_CTX_FITS macro, to fit a more extended usage. Suggested-by: Jakub Kicinski Reviewed-by: Jakub Kicinski Reviewed-by: Jiri Pirko Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/1130cc2896626b84587a2a5f96a5c6829638f4da.1728460186.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/netlink.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index b332c2048c75..a3ca198a3a9e 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -34,6 +34,7 @@ struct netlink_skb_parms { #define NETLINK_CB(skb) (*(struct netlink_skb_parms*)&((skb)->cb)) #define NETLINK_CREDS(skb) (&NETLINK_CB((skb)).creds) +#define NETLINK_CTX_SIZE 48 void netlink_table_grab(void); @@ -293,7 +294,7 @@ struct netlink_callback { int flags; bool strict_check; union { - u8 ctx[48]; + u8 ctx[NETLINK_CTX_SIZE]; /* args is deprecated. Cast a struct over ctx instead * for proper type safety. @@ -302,7 +303,7 @@ struct netlink_callback { }; }; -#define NL_ASSERT_DUMP_CTX_FITS(type_name) \ +#define NL_ASSERT_CTX_FITS(type_name) \ BUILD_BUG_ON(sizeof(type_name) > \ sizeof_field(struct netlink_callback, ctx)) -- cgit v1.2.3 From 4b623f9f0f59652ea71fcb27d60b4c3b65126dbb Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 9 Oct 2024 10:09:49 +0200 Subject: net-shapers: implement NL get operation Introduce the basic infrastructure to implement the net-shaper core functionality. Each network devices carries a net-shaper cache, the NL get() operation fetches the data from such cache. The cache is initially empty, will be fill by the set()/group() operation implemented later and is destroyed at device cleanup time. The net_shaper_fill_handle(), net_shaper_ctx_init(), and net_shaper_generic_pre() implementations handle generic index type attributes, despite the current caller always pass a constant value to avoid more noise in later patches using them with different attributes. Reviewed-by: Jakub Kicinski Reviewed-by: Jiri Pirko Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/ddd10fd645a9367803ad02fca4a5664ea5ace170.1728460186.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3baf8e539b6f..e6b93d01e631 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1603,6 +1603,14 @@ struct net_device_ops { int (*ndo_hwtstamp_set)(struct net_device *dev, struct kernel_hwtstamp_config *kernel_config, struct netlink_ext_ack *extack); + +#if IS_ENABLED(CONFIG_NET_SHAPER) + /** + * @net_shaper_ops: Device shaping offload operations + * see include/net/net_shapers.h + */ + const struct net_shaper_ops *net_shaper_ops; +#endif }; /** @@ -2406,6 +2414,19 @@ struct net_device { u64 max_pacing_offload_horizon; + /** + * @lock: protects @net_shaper_hierarchy, feel free to use for other + * netdev-scope protection. Ordering: take after rtnl_lock. + */ + struct mutex lock; + +#if IS_ENABLED(CONFIG_NET_SHAPER) + /** + * @net_shaper_hierarchy: data tracking the current shaper status + * see include/net/net_shapers.h + */ + struct net_shaper_hierarchy *net_shaper_hierarchy; +#endif u8 priv[] ____cacheline_aligned __counted_by(priv_len); } ____cacheline_aligned; -- cgit v1.2.3 From 608a5c05c39b75fa2539ce9e521d289c5a5326f7 Mon Sep 17 00:00:00 2001 From: Wenjun Wu Date: Wed, 9 Oct 2024 10:09:58 +0200 Subject: virtchnl: support queue rate limit and quanta size configuration This patch adds new virtchnl opcodes and structures for rate limit and quanta size configuration, which include: 1. VIRTCHNL_OP_CONFIG_QUEUE_BW, to configure max bandwidth for each VF per queue. 2. VIRTCHNL_OP_CONFIG_QUANTA, to configure quanta size per queue. 3. VIRTCHNL_OP_GET_QOS_CAPS, VF queries current QoS configuration, such as enabled TCs, arbiter type, up2tc and bandwidth of VSI node. The configuration is previously set by DCB and PF, and now is the potential QoS capability of VF. VF can take it as reference to configure queue TC mapping. Reviewed-by: Jiri Pirko Signed-off-by: Wenjun Wu Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/839002f7bd6f63b985a060a51b079f6e6dbbe237.1728460186.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/avf/virtchnl.h | 119 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) (limited to 'include/linux') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index f41395264dca..223e433c39fe 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -89,6 +89,9 @@ enum virtchnl_rx_hsplit { VIRTCHNL_RX_HSPLIT_SPLIT_SCTP = 8, }; +enum virtchnl_bw_limit_type { + VIRTCHNL_BW_SHAPER = 0, +}; /* END GENERIC DEFINES */ /* Opcodes for VF-PF communication. These are placed in the v_opcode field @@ -151,6 +154,11 @@ enum virtchnl_ops { VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2 = 55, VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2 = 56, VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2 = 57, + /* opcode 57 - 65 are reserved */ + VIRTCHNL_OP_GET_QOS_CAPS = 66, + /* opcode 68 through 111 are reserved */ + VIRTCHNL_OP_CONFIG_QUEUE_BW = 112, + VIRTCHNL_OP_CONFIG_QUANTA = 113, VIRTCHNL_OP_MAX, }; @@ -261,6 +269,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource); #define VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC BIT(26) #define VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF BIT(27) #define VIRTCHNL_VF_OFFLOAD_FDIR_PF BIT(28) +#define VIRTCHNL_VF_OFFLOAD_QOS BIT(29) #define VF_BASE_MODE_OFFLOADS (VIRTCHNL_VF_OFFLOAD_L2 | \ VIRTCHNL_VF_OFFLOAD_VLAN | \ @@ -1416,6 +1425,85 @@ struct virtchnl_fdir_del { VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_fdir_del); +struct virtchnl_shaper_bw { + /* Unit is Kbps */ + u32 committed; + u32 peak; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_shaper_bw); + +/* VIRTCHNL_OP_GET_QOS_CAPS + * VF sends this message to get its QoS Caps, such as + * TC number, Arbiter and Bandwidth. + */ +struct virtchnl_qos_cap_elem { + u8 tc_num; + u8 tc_prio; +#define VIRTCHNL_ABITER_STRICT 0 +#define VIRTCHNL_ABITER_ETS 2 + u8 arbiter; +#define VIRTCHNL_STRICT_WEIGHT 1 + u8 weight; + enum virtchnl_bw_limit_type type; + union { + struct virtchnl_shaper_bw shaper; + u8 pad2[32]; + }; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(40, virtchnl_qos_cap_elem); + +struct virtchnl_qos_cap_list { + u16 vsi_id; + u16 num_elem; + struct virtchnl_qos_cap_elem cap[]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(4, virtchnl_qos_cap_list); +#define virtchnl_qos_cap_list_LEGACY_SIZEOF 44 + +/* VIRTCHNL_OP_CONFIG_QUEUE_BW */ +struct virtchnl_queue_bw { + u16 queue_id; + u8 tc; + u8 pad; + struct virtchnl_shaper_bw shaper; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_queue_bw); + +struct virtchnl_queues_bw_cfg { + u16 vsi_id; + u16 num_queues; + struct virtchnl_queue_bw cfg[]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(4, virtchnl_queues_bw_cfg); +#define virtchnl_queues_bw_cfg_LEGACY_SIZEOF 16 + +enum virtchnl_queue_type { + VIRTCHNL_QUEUE_TYPE_TX = 0, + VIRTCHNL_QUEUE_TYPE_RX = 1, +}; + +/* structure to specify a chunk of contiguous queues */ +struct virtchnl_queue_chunk { + /* see enum virtchnl_queue_type */ + s32 type; + u16 start_queue_id; + u16 num_queues; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_queue_chunk); + +struct virtchnl_quanta_cfg { + u16 quanta_size; + struct virtchnl_queue_chunk queue_select; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_quanta_cfg); + #define __vss_byone(p, member, count, old) \ (struct_size(p, member, count) + (old - 1 - struct_size(p, member, 0))) @@ -1438,6 +1526,8 @@ VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_fdir_del); __vss(virtchnl_vlan_filter_list_v2, __vss_byelem, p, m, c), \ __vss(virtchnl_tc_info, __vss_byelem, p, m, c), \ __vss(virtchnl_rdma_qvlist_info, __vss_byelem, p, m, c), \ + __vss(virtchnl_qos_cap_list, __vss_byelem, p, m, c), \ + __vss(virtchnl_queues_bw_cfg, __vss_byelem, p, m, c), \ __vss(virtchnl_rss_key, __vss_byone, p, m, c), \ __vss(virtchnl_rss_lut, __vss_byone, p, m, c)) @@ -1637,6 +1727,35 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode, case VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2: valid_len = sizeof(struct virtchnl_vlan_setting); break; + case VIRTCHNL_OP_GET_QOS_CAPS: + break; + case VIRTCHNL_OP_CONFIG_QUEUE_BW: + valid_len = virtchnl_queues_bw_cfg_LEGACY_SIZEOF; + if (msglen >= valid_len) { + struct virtchnl_queues_bw_cfg *q_bw = + (struct virtchnl_queues_bw_cfg *)msg; + + valid_len = virtchnl_struct_size(q_bw, cfg, + q_bw->num_queues); + if (q_bw->num_queues == 0) { + err_msg_format = true; + break; + } + } + break; + case VIRTCHNL_OP_CONFIG_QUANTA: + valid_len = sizeof(struct virtchnl_quanta_cfg); + if (msglen >= valid_len) { + struct virtchnl_quanta_cfg *q_quanta = + (struct virtchnl_quanta_cfg *)msg; + + if (q_quanta->quanta_size == 0 || + q_quanta->queue_select.num_queues == 0) { + err_msg_format = true; + break; + } + } + break; /* These are always errors coming from the VF. */ case VIRTCHNL_OP_EVENT: case VIRTCHNL_OP_UNKNOWN: -- cgit v1.2.3 From 0a6c61bc9c636e9a32d9f5a4d6d3b031d08763ab Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 10 Oct 2024 23:59:09 +0900 Subject: fgraph: Simplify return address printing in function graph tracer Simplify return address printing in the function graph tracer by removing fgraph_extras. Since this feature is only used by the function graph tracer and the feature flags can directly accessible from the function graph tracer, fgraph_extras can be removed from the fgraph callback. Cc: Donglin Peng Link: https://lore.kernel.org/172857234900.270774.15378354017601069781.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 2ac3b3b53cd0..4c7dd5e58c9f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1068,29 +1068,20 @@ struct ftrace_graph_ret { unsigned long long rettime; } __packed; -struct fgraph_extras; struct fgraph_ops; /* Type of the callback handlers for tracing function graph*/ typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *, struct fgraph_ops *); /* return */ typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *, - struct fgraph_ops *, - struct fgraph_extras *); /* entry */ + struct fgraph_ops *); /* entry */ extern int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops, - struct fgraph_extras *extras); + struct fgraph_ops *gops); bool ftrace_pids_enabled(struct ftrace_ops *ops); #ifdef CONFIG_FUNCTION_GRAPH_TRACER -/* Used to convey some extra datas when creating a graph entry */ -struct fgraph_extras { - u32 flags; - unsigned long retaddr; -}; - struct fgraph_ops { trace_func_graph_ent_t entryfunc; trace_func_graph_ret_t retfunc; @@ -1131,13 +1122,12 @@ function_graph_enter(unsigned long ret, unsigned long func, struct ftrace_ret_stack * ftrace_graph_get_ret_stack(struct task_struct *task, int skip); +unsigned long ftrace_graph_top_ret_addr(struct task_struct *task); unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, unsigned long ret, unsigned long *retp); unsigned long *fgraph_get_task_var(struct fgraph_ops *gops); -u32 graph_tracer_flags_get(u32 flags); - /* * Sometimes we don't want to trace a function with the function * graph tracer but we want them to keep traced by the usual function -- cgit v1.2.3 From bafffd56c608106d11e7aec851f114dcd66b2091 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 10 Oct 2024 14:54:46 +0100 Subject: clocksource: Remove unused clocksource_change_rating clocksource_change_rating() has been unused since 2017's commit 63ed4e0c67df ("Drivers: hv: vmbus: Consolidate all Hyper-V specific clocksource code") Remove it. __clocksource_change_rating now only has one use which is ifdef'd. Move it into the ifdef'd section. Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241010135446.213098-1-linux@treblig.org --- include/linux/clocksource.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index d35b677b08fe..ef1b16da6ad5 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -215,7 +215,6 @@ static inline s64 clocksource_cyc2ns(u64 cycles, u32 mult, u32 shift) extern int clocksource_unregister(struct clocksource*); extern void clocksource_touch_watchdog(void); -extern void clocksource_change_rating(struct clocksource *cs, int rating); extern void clocksource_suspend(void); extern void clocksource_resume(void); extern struct clocksource * __init clocksource_default_clock(void); -- cgit v1.2.3 From 445936f9e258eca624c8239056bd8cd6e853b3fd Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 23 Sep 2024 11:59:57 +0200 Subject: thermal: core: Add user thresholds support The user thresholds mechanism is a way to have the userspace to tell the thermal framework to send a notification when a temperature limit is crossed. There is no id, no hysteresis, just the temperature and the direction of the limit crossing. That means we can be notified when a threshold is crossed the way up only, or the way down only or both ways. That allows to create hysteresis values if it is needed. A threshold can be added, deleted or flushed. The latter means all thresholds belonging to a thermal zone will be deleted. When a threshold is added: - if the same threshold (temperature and direction) exists, an error is returned - if a threshold is specified with the same temperature but a different direction, the specified direction is added - if there is no threshold with the same temperature then it is created When a threshold is deleted: - if the same threshold (temperature and direction) exists, it is deleted - if a threshold is specified with the same temperature but a different direction, the specified direction is removed - if there is no threshold with the same temperature, then an error is returned When the threshold are flushed: - All thresholds related to a thermal zone are deleted When a threshold is crossed: - the userspace does not need to know which threshold(s) have been crossed, it will be notified with the current temperature and the previous temperature - if multiple thresholds have been crossed between two updates only one notification will be send to the userspace, it is pointless to send a notification per thresholds crossed as the userspace can handle that easily when it has the temperature delta information Signed-off-by: Daniel Lezcano Link: https://patch.msgid.link/20240923100005.2532430-2-daniel.lezcano@linaro.org [ rjw: Subject edit, use BIT(0) and BIT(1) in symbol definitions ] Signed-off-by: Rafael J. Wysocki --- include/linux/thermal.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 25ea8fe2313e..bcaa92732e14 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -56,6 +56,9 @@ enum thermal_notify_event { THERMAL_TZ_UNBIND_CDEV, /* Cooling dev is unbind from the thermal zone */ THERMAL_INSTANCE_WEIGHT_CHANGED, /* Thermal instance weight changed */ THERMAL_TZ_RESUME, /* Thermal zone is resuming after system sleep */ + THERMAL_TZ_ADD_THRESHOLD, /* Threshold added */ + THERMAL_TZ_DEL_THRESHOLD, /* Threshold deleted */ + THERMAL_TZ_FLUSH_THRESHOLDS, /* All thresholds deleted */ }; /** -- cgit v1.2.3 From 9fb6fef0fb49124291837af1da5028f79d53f98e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 14 Jun 2024 13:06:03 +0300 Subject: resource: Add resource set range and size helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Setting the end address for a resource with a given size lacks a helper and is therefore coded manually unlike the getter side which has a helper for resource size calculation. Also, almost all callsites that calculate the end address for a resource also set the start address right before it like this: res->start = start_addr; res->end = res->start + size - 1; Add resource_set_range(res, start_addr, size) that sets the start address and calculates the end address to simplify this often repeated fragment. Also add resource_set_size() for the cases where setting the start address of the resource is not necessary but mention in its kerneldoc that resource_set_range() is preferred when setting both addresses. Link: https://lore.kernel.org/r/20240614100606.15830-2-ilpo.jarvinen@linux.intel.com Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron --- include/linux/ioport.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 6e9fb667a1c5..5385349f0b8a 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -249,6 +249,38 @@ struct resource *lookup_resource(struct resource *root, resource_size_t start); int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size); resource_size_t resource_alignment(struct resource *res); + +/** + * resource_set_size - Calculate resource end address from size and start + * @res: Resource descriptor + * @size: Size of the resource + * + * Calculate the end address for @res based on @size. + * + * Note: The start address of @res must be set when calling this function. + * Prefer resource_set_range() if setting both the start address and @size. + */ +static inline void resource_set_size(struct resource *res, resource_size_t size) +{ + res->end = res->start + size - 1; +} + +/** + * resource_set_range - Set resource start and end addresses + * @res: Resource descriptor + * @start: Start address for the resource + * @size: Size of the resource + * + * Set @res start address and calculate the end address based on @size. + */ +static inline void resource_set_range(struct resource *res, + resource_size_t start, + resource_size_t size) +{ + res->start = start; + resource_set_size(res, size); +} + static inline resource_size_t resource_size(const struct resource *res) { return res->end - res->start + 1; -- cgit v1.2.3 From 7888af4166d4ab07ba51234be6ba332b7807e901 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 8 Oct 2024 19:05:28 -0400 Subject: ftrace: Make ftrace_regs abstract from direct use ftrace_regs was created to hold registers that store information to save function parameters, return value and stack. Since it is a subset of pt_regs, it should only be used by its accessor functions. But because pt_regs can easily be taken from ftrace_regs (on most archs), it is tempting to use it directly. But when running on other architectures, it may fail to build or worse, build but crash the kernel! Instead, make struct ftrace_regs an empty structure and have the architectures define __arch_ftrace_regs and all the accessor functions will typecast to it to get to the actual fields. This will help avoid usage of ftrace_regs directly. Link: https://lore.kernel.org/all/20241007171027.629bdafd@gandalf.local.home/ Cc: "linux-arch@vger.kernel.org" Cc: "x86@kernel.org" Cc: Mathieu Desnoyers Cc: Mark Rutland Cc: Will Deacon Cc: Huacai Chen Cc: WANG Xuerui Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Cc: Naveen N Rao Cc: Madhavan Srinivasan Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Link: https://lore.kernel.org/20241008230628.958778821@goodmis.org Acked-by: Catalin Marinas Signed-off-by: Steven Rostedt (Google) Acked-by: Masami Hiramatsu (Google) Acked-by: Heiko Carstens # s390 Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4c7dd5e58c9f..66f10291a0b2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -115,8 +115,6 @@ static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *val extern int ftrace_enabled; -#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS - /** * ftrace_regs - ftrace partial/optimal register set * @@ -142,11 +140,28 @@ extern int ftrace_enabled; * * NOTE: user *must not* access regs directly, only do it via APIs, because * the member can be changed according to the architecture. + * This is why the structure is empty here, so that nothing accesses + * the ftrace_regs directly. */ struct ftrace_regs { + /* Nothing to see here, use the accessor functions! */ +}; + +#define ftrace_regs_size() sizeof(struct __arch_ftrace_regs) + +#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS + +struct __arch_ftrace_regs { struct pt_regs regs; }; -#define arch_ftrace_get_regs(fregs) (&(fregs)->regs) + +struct ftrace_regs; +#define arch_ftrace_regs(fregs) ((struct __arch_ftrace_regs *)(fregs)) + +static inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs) +{ + return &arch_ftrace_regs(fregs)->regs; +} /* * ftrace_regs_set_instruction_pointer() is to be defined by the architecture -- cgit v1.2.3 From 483c5c2bc6b1c0d17214e2672032def605ff2ba9 Mon Sep 17 00:00:00 2001 From: Yanteng Si Date: Fri, 20 Sep 2024 13:34:23 +0800 Subject: serial: clean up uart_info Since commit ebd2c8f6d2ec ("serial: kill off uart_info") has removed uart_info, the uart_info declaration looks lonely, let it go. Signed-off-by: Yanteng Si Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240920053423.1373354-1-siyanteng@cqsoftware.com.cn Signed-off-by: Greg Kroah-Hartman --- include/linux/platform_data/sa11x0-serial.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/sa11x0-serial.h b/include/linux/platform_data/sa11x0-serial.h index 8b79ab08af45..a88096bc74e4 100644 --- a/include/linux/platform_data/sa11x0-serial.h +++ b/include/linux/platform_data/sa11x0-serial.h @@ -10,7 +10,6 @@ #define SA11X0_SERIAL_H struct uart_port; -struct uart_info; /* * This is a temporary structure for registering these -- cgit v1.2.3 From 7738a7ab9d12c5371ed97114ee2132d4512e9fd5 Mon Sep 17 00:00:00 2001 From: Parker Newman Date: Wed, 2 Oct 2024 11:12:33 -0400 Subject: misc: eeprom: eeprom_93cx6: Add quirk for extra read clock cycle Add a quirk similar to eeprom_93xx46 to add an extra clock cycle before reading data from the EEPROM. The 93Cx6 family of EEPROMs output a "dummy 0 bit" between the writing of the op-code/address from the host to the EEPROM and the reading of the actual data from the EEPROM. More info can be found on page 6 of the AT93C46 datasheet (linked below). Similar notes are found in other 93xx6 datasheets. In summary the read operation for a 93Cx6 EEPROM is: Write to EEPROM: 110[A5-A0] (9 bits) Read from EEPROM: 0[D15-D0] (17 bits) Where: 110 is the start bit and READ OpCode [A5-A0] is the address to read from 0 is a "dummy bit" preceding the actual data [D15-D0] is the actual data. Looking at the READ timing diagrams in the 93Cx6 datasheets the dummy bit should be clocked out on the last address bit clock cycle meaning it should be discarded naturally. However, depending on the hardware configuration sometimes this dummy bit is not discarded. This is the case with Exar PCI UARTs which require an extra clock cycle between sending the address and reading the data. Datasheet: https://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-5193-SEEPROM-AT93C46D-Datasheet.pdf Reviewed-by: Andy Shevchenko Signed-off-by: Parker Newman Link: https://lore.kernel.org/r/0f23973efefccd2544705a0480b4ad4c2353e407.1727880931.git.pnewman@connecttech.com Signed-off-by: Greg Kroah-Hartman --- include/linux/eeprom_93cx6.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/eeprom_93cx6.h b/include/linux/eeprom_93cx6.h index c860c72a921d..3a485cc0e0fa 100644 --- a/include/linux/eeprom_93cx6.h +++ b/include/linux/eeprom_93cx6.h @@ -11,6 +11,8 @@ Supported chipsets: 93c46, 93c56 and 93c66. */ +#include + /* * EEPROM operation defines. */ @@ -34,6 +36,7 @@ * @register_write(struct eeprom_93cx6 *eeprom): handler to * write to the eeprom register by using all reg_* fields. * @width: eeprom width, should be one of the PCI_EEPROM_WIDTH_* defines + * @quirks: eeprom or controller quirks * @drive_data: Set if we're driving the data line. * @reg_data_in: register field to indicate data input * @reg_data_out: register field to indicate data output @@ -50,6 +53,9 @@ struct eeprom_93cx6 { void (*register_write)(struct eeprom_93cx6 *eeprom); int width; + unsigned int quirks; +/* Some EEPROMs require an extra clock cycle before reading */ +#define PCI_EEPROM_QUIRK_EXTRA_READ_CYCLE BIT(0) char drive_data; char reg_data_in; @@ -71,3 +77,8 @@ extern void eeprom_93cx6_wren(struct eeprom_93cx6 *eeprom, bool enable); extern void eeprom_93cx6_write(struct eeprom_93cx6 *eeprom, u8 addr, u16 data); + +static inline bool has_quirk_extra_read_cycle(struct eeprom_93cx6 *eeprom) +{ + return eeprom->quirks & PCI_EEPROM_QUIRK_EXTRA_READ_CYCLE; +} -- cgit v1.2.3 From 11dad94b50263bbe87d015041c77be61c8c44161 Mon Sep 17 00:00:00 2001 From: Andrew Kreimer Date: Thu, 10 Oct 2024 12:13:55 +0300 Subject: phy: sun4i-usb: Fix a typo Fix a typo in comments: wether -> whether. Signed-off-by: Andrew Kreimer Acked-by: Chen-Yu Tsai Link: https://lore.kernel.org/r/20241010091355.8271-1-algonell@gmail.com Signed-off-by: Vinod Koul --- include/linux/phy/phy-sun4i-usb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy/phy-sun4i-usb.h b/include/linux/phy/phy-sun4i-usb.h index 91eb755ee73b..f3e7b13608e4 100644 --- a/include/linux/phy/phy-sun4i-usb.h +++ b/include/linux/phy/phy-sun4i-usb.h @@ -11,7 +11,7 @@ /** * sun4i_usb_phy_set_squelch_detect() - Enable/disable squelch detect * @phy: reference to a sun4i usb phy - * @enabled: wether to enable or disable squelch detect + * @enabled: whether to enable or disable squelch detect */ void sun4i_usb_phy_set_squelch_detect(struct phy *phy, bool enabled); -- cgit v1.2.3 From ed870e35db660724ff0d815d9a3ef9a6247ffbab Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 9 Oct 2024 10:32:09 -0700 Subject: lsm: add the lsm_prop data structure When more than one security module is exporting data to audit and networking sub-systems a single 32 bit integer is no longer sufficient to represent the data. Add a structure to be used instead. The lsm_prop structure definition is intended to keep the LSM specific information private to the individual security modules. The module specific information is included in a new set of header files under include/lsm. Each security module is allowed to define the information included for its use in the lsm_prop. SELinux includes a u32 secid. Smack includes a pointer into its global label list. The conditional compilation based on feature inclusion is contained in the include/lsm files. Cc: apparmor@lists.ubuntu.com Cc: bpf@vger.kernel.org Cc: selinux@vger.kernel.org Cc: linux-security-module@vger.kernel.org Suggested-by: Paul Moore Signed-off-by: Casey Schaufler Acked-by: John Johansen [PM: added include/linux/lsm/ to MAINTAINERS, subj tweak] Signed-off-by: Paul Moore --- include/linux/lsm/apparmor.h | 17 +++++++++++++++++ include/linux/lsm/bpf.h | 16 ++++++++++++++++ include/linux/lsm/selinux.h | 16 ++++++++++++++++ include/linux/lsm/smack.h | 17 +++++++++++++++++ include/linux/security.h | 20 ++++++++++++++++++++ 5 files changed, 86 insertions(+) create mode 100644 include/linux/lsm/apparmor.h create mode 100644 include/linux/lsm/bpf.h create mode 100644 include/linux/lsm/selinux.h create mode 100644 include/linux/lsm/smack.h (limited to 'include/linux') diff --git a/include/linux/lsm/apparmor.h b/include/linux/lsm/apparmor.h new file mode 100644 index 000000000000..612cbfacb072 --- /dev/null +++ b/include/linux/lsm/apparmor.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Linux Security Module interface to other subsystems. + * AppArmor presents single pointer to an aa_label structure. + */ +#ifndef __LINUX_LSM_APPARMOR_H +#define __LINUX_LSM_APPARMOR_H + +struct aa_label; + +struct lsm_prop_apparmor { +#ifdef CONFIG_SECURITY_APPARMOR + struct aa_label *label; +#endif +}; + +#endif /* ! __LINUX_LSM_APPARMOR_H */ diff --git a/include/linux/lsm/bpf.h b/include/linux/lsm/bpf.h new file mode 100644 index 000000000000..8106e206fcef --- /dev/null +++ b/include/linux/lsm/bpf.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Linux Security Module interface to other subsystems. + * BPF may present a single u32 value. + */ +#ifndef __LINUX_LSM_BPF_H +#define __LINUX_LSM_BPF_H +#include + +struct lsm_prop_bpf { +#ifdef CONFIG_BPF_LSM + u32 secid; +#endif +}; + +#endif /* ! __LINUX_LSM_BPF_H */ diff --git a/include/linux/lsm/selinux.h b/include/linux/lsm/selinux.h new file mode 100644 index 000000000000..9455a6b5b910 --- /dev/null +++ b/include/linux/lsm/selinux.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Linux Security Module interface to other subsystems. + * SELinux presents a single u32 value which is known as a secid. + */ +#ifndef __LINUX_LSM_SELINUX_H +#define __LINUX_LSM_SELINUX_H +#include + +struct lsm_prop_selinux { +#ifdef CONFIG_SECURITY_SELINUX + u32 secid; +#endif +}; + +#endif /* ! __LINUX_LSM_SELINUX_H */ diff --git a/include/linux/lsm/smack.h b/include/linux/lsm/smack.h new file mode 100644 index 000000000000..ff730dd7a734 --- /dev/null +++ b/include/linux/lsm/smack.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Linux Security Module interface to other subsystems. + * Smack presents a pointer into the global Smack label list. + */ +#ifndef __LINUX_LSM_SMACK_H +#define __LINUX_LSM_SMACK_H + +struct smack_known; + +struct lsm_prop_smack { +#ifdef CONFIG_SECURITY_SMACK + struct smack_known *skp; +#endif +}; + +#endif /* ! __LINUX_LSM_SMACK_H */ diff --git a/include/linux/security.h b/include/linux/security.h index b86ec2afc691..555249a8d121 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -34,6 +34,10 @@ #include #include #include +#include +#include +#include +#include struct linux_binprm; struct cred; @@ -152,6 +156,22 @@ enum lockdown_reason { LOCKDOWN_CONFIDENTIALITY_MAX, }; +/* scaffolding */ +struct lsm_prop_scaffold { + u32 secid; +}; + +/* + * Data exported by the security modules + */ +struct lsm_prop { + struct lsm_prop_selinux selinux; + struct lsm_prop_smack smack; + struct lsm_prop_apparmor apparmor; + struct lsm_prop_bpf bpf; + struct lsm_prop_scaffold scaffold; +}; + extern const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1]; extern u32 lsm_active_cnt; extern const struct lsm_id *lsm_idlist[]; -- cgit v1.2.3 From 870b7fdc660b38c4e1bd8bf48e62aa352ddf8f42 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 9 Oct 2024 10:32:10 -0700 Subject: lsm: use lsm_prop in security_audit_rule_match Change the secid parameter of security_audit_rule_match to a lsm_prop structure pointer. Pass the entry from the lsm_prop structure for the approprite slot to the LSM hook. Change the users of security_audit_rule_match to use the lsm_prop instead of a u32. The scaffolding function lsmprop_init() fills the structure with the value of the old secid, ensuring that it is available to the appropriate module hook. The sources of the secid, security_task_getsecid() and security_inode_getsecid(), will be converted to use the lsm_prop structure later in the series. At that point the use of lsmprop_init() is dropped. Signed-off-by: Casey Schaufler [PM: subject line tweak] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 3 ++- include/linux/security.h | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 9eca013aa5e1..ea7f17e37756 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -416,7 +416,8 @@ LSM_HOOK(void, LSM_RET_VOID, key_post_create_or_update, struct key *keyring, LSM_HOOK(int, 0, audit_rule_init, u32 field, u32 op, char *rulestr, void **lsmrule, gfp_t gfp) LSM_HOOK(int, 0, audit_rule_known, struct audit_krule *krule) -LSM_HOOK(int, 0, audit_rule_match, u32 secid, u32 field, u32 op, void *lsmrule) +LSM_HOOK(int, 0, audit_rule_match, struct lsm_prop *prop, u32 field, u32 op, + void *lsmrule) LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule) #endif /* CONFIG_AUDIT */ diff --git a/include/linux/security.h b/include/linux/security.h index 555249a8d121..a4f020491e7c 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2115,7 +2115,8 @@ static inline void security_key_post_create_or_update(struct key *keyring, int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule, gfp_t gfp); int security_audit_rule_known(struct audit_krule *krule); -int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule); +int security_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op, + void *lsmrule); void security_audit_rule_free(void *lsmrule); #else @@ -2131,8 +2132,8 @@ static inline int security_audit_rule_known(struct audit_krule *krule) return 0; } -static inline int security_audit_rule_match(u32 secid, u32 field, u32 op, - void *lsmrule) +static inline int security_audit_rule_match(struct lsm_prop *prop, u32 field, + u32 op, void *lsmrule) { return 0; } -- cgit v1.2.3 From 6f2f724f0e116d9ea960ff3dd645add12e60e176 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 9 Oct 2024 10:32:11 -0700 Subject: lsm: add lsmprop_to_secctx hook Add a new hook security_lsmprop_to_secctx() and its LSM specific implementations. The LSM specific code will use the lsm_prop element allocated for that module. This allows for the possibility that more than one module may be called upon to translate a secid to a string, as can occur in the audit code. Signed-off-by: Casey Schaufler [PM: subject line tweak] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 2 ++ include/linux/security.h | 10 +++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index ea7f17e37756..ed6ea0b1ec57 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -294,6 +294,8 @@ LSM_HOOK(int, -EINVAL, setprocattr, const char *name, void *value, size_t size) LSM_HOOK(int, 0, ismaclabel, const char *name) LSM_HOOK(int, -EOPNOTSUPP, secid_to_secctx, u32 secid, char **secdata, u32 *seclen) +LSM_HOOK(int, -EOPNOTSUPP, lsmprop_to_secctx, struct lsm_prop *prop, + char **secdata, u32 *seclen) LSM_HOOK(int, 0, secctx_to_secid, const char *secdata, u32 seclen, u32 *secid) LSM_HOOK(void, LSM_RET_VOID, release_secctx, char *secdata, u32 seclen) LSM_HOOK(void, LSM_RET_VOID, inode_invalidate_secctx, struct inode *inode) diff --git a/include/linux/security.h b/include/linux/security.h index a4f020491e7c..f1c68e38b15d 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -535,6 +535,7 @@ int security_setprocattr(int lsmid, const char *name, void *value, size_t size); int security_netlink_send(struct sock *sk, struct sk_buff *skb); int security_ismaclabel(const char *name); int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen); +int security_lsmprop_to_secctx(struct lsm_prop *prop, char **secdata, u32 *seclen); int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid); void security_release_secctx(char *secdata, u32 seclen); void security_inode_invalidate_secctx(struct inode *inode); @@ -1488,7 +1489,14 @@ static inline int security_ismaclabel(const char *name) return 0; } -static inline int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) +static inline int security_secid_to_secctx(u32 secid, char **secdata, + u32 *seclen) +{ + return -EOPNOTSUPP; +} + +static inline int security_lsmprop_to_secctx(struct lsm_prop *prop, + char **secdata, u32 *seclen) { return -EOPNOTSUPP; } -- cgit v1.2.3 From 7183abccd8ac2c486363e267b5d84032818eb725 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 9 Oct 2024 10:32:12 -0700 Subject: audit: maintain an lsm_prop in audit_context Replace the secid value stored in struct audit_context with a struct lsm_prop. Change the code that uses this value to accommodate the change. security_audit_rule_match() expects a lsm_prop, so existing scaffolding can be removed. A call to security_secid_to_secctx() is changed to security_lsmprop_to_secctx(). The call to security_ipc_getsecid() is scaffolded. A new function lsmprop_is_set() is introduced to identify whether an lsm_prop contains a non-zero value. Signed-off-by: Casey Schaufler [PM: subject line tweak, fix lsmprop_is_set() typo] Signed-off-by: Paul Moore --- include/linux/security.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index f1c68e38b15d..c029bfe2c5bb 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -291,6 +291,19 @@ static inline const char *kernel_load_data_id_str(enum kernel_load_data_id id) #ifdef CONFIG_SECURITY +/** + * lsmprop_is_set - report if there is a value in the lsm_prop + * @prop: Pointer to the exported LSM data + * + * Returns true if there is a value set, false otherwise + */ +static inline bool lsmprop_is_set(struct lsm_prop *prop) +{ + const struct lsm_prop empty = {}; + + return !!memcmp(prop, &empty, sizeof(*prop)); +} + int call_blocking_lsm_notifier(enum lsm_event event, void *data); int register_blocking_lsm_notifier(struct notifier_block *nb); int unregister_blocking_lsm_notifier(struct notifier_block *nb); @@ -552,6 +565,17 @@ int security_bdev_setintegrity(struct block_device *bdev, size_t size); #else /* CONFIG_SECURITY */ +/** + * lsmprop_is_set - report if there is a value in the lsm_prop + * @prop: Pointer to the exported LSM data + * + * Returns true if there is a value set, false otherwise + */ +static inline bool lsmprop_is_set(struct lsm_prop *prop) +{ + return false; +} + static inline int call_blocking_lsm_notifier(enum lsm_event event, void *data) { return 0; -- cgit v1.2.3 From f4602f163c98bc93c118e196466c1c98186adb67 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 9 Oct 2024 10:32:13 -0700 Subject: lsm: use lsm_prop in security_ipc_getsecid There may be more than one LSM that provides IPC data for auditing. Change security_ipc_getsecid() to fill in a lsm_prop structure instead of the u32 secid. Change the name to security_ipc_getlsmprop() to reflect the change. Cc: audit@vger.kernel.org Cc: linux-security-module@vger.kernel.org Cc: selinux@vger.kernel.org Signed-off-by: Casey Schaufler [PM: subject line tweak] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 4 ++-- include/linux/security.h | 18 +++++++++++++++--- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index ed6ea0b1ec57..6ef2a345ea03 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -256,8 +256,8 @@ LSM_HOOK(void, LSM_RET_VOID, task_to_inode, struct task_struct *p, struct inode *inode) LSM_HOOK(int, 0, userns_create, const struct cred *cred) LSM_HOOK(int, 0, ipc_permission, struct kern_ipc_perm *ipcp, short flag) -LSM_HOOK(void, LSM_RET_VOID, ipc_getsecid, struct kern_ipc_perm *ipcp, - u32 *secid) +LSM_HOOK(void, LSM_RET_VOID, ipc_getlsmprop, struct kern_ipc_perm *ipcp, + struct lsm_prop *prop) LSM_HOOK(int, 0, msg_msg_alloc_security, struct msg_msg *msg) LSM_HOOK(void, LSM_RET_VOID, msg_msg_free_security, struct msg_msg *msg) LSM_HOOK(int, 0, msg_queue_alloc_security, struct kern_ipc_perm *perm) diff --git a/include/linux/security.h b/include/linux/security.h index c029bfe2c5bb..15aef5f68e77 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -289,6 +289,17 @@ static inline const char *kernel_load_data_id_str(enum kernel_load_data_id id) return kernel_load_data_str[id]; } +/** + * lsmprop_init - initialize a lsm_prop structure + * @prop: Pointer to the data to initialize + * + * Set all secid for all modules to the specified value. + */ +static inline void lsmprop_init(struct lsm_prop *prop) +{ + memset(prop, 0, sizeof(*prop)); +} + #ifdef CONFIG_SECURITY /** @@ -515,7 +526,7 @@ int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, void security_task_to_inode(struct task_struct *p, struct inode *inode); int security_create_user_ns(const struct cred *cred); int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag); -void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid); +void security_ipc_getlsmprop(struct kern_ipc_perm *ipcp, struct lsm_prop *prop); int security_msg_msg_alloc(struct msg_msg *msg); void security_msg_msg_free(struct msg_msg *msg); int security_msg_queue_alloc(struct kern_ipc_perm *msq); @@ -1377,9 +1388,10 @@ static inline int security_ipc_permission(struct kern_ipc_perm *ipcp, return 0; } -static inline void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid) +static inline void security_ipc_getlsmprop(struct kern_ipc_perm *ipcp, + struct lsm_prop *prop) { - *secid = 0; + lsmprop_init(prop); } static inline int security_msg_msg_alloc(struct msg_msg *msg) -- cgit v1.2.3 From 37f670aacd481128ad9a940ac2d3372aecd92824 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 9 Oct 2024 10:32:15 -0700 Subject: lsm: use lsm_prop in security_current_getsecid Change the security_current_getsecid_subj() and security_task_getsecid_obj() interfaces to fill in a lsm_prop structure instead of a u32 secid. Audit interfaces will need to collect all possible security data for possible reporting. Cc: linux-integrity@vger.kernel.org Cc: audit@vger.kernel.org Cc: selinux@vger.kernel.org Signed-off-by: Casey Schaufler [PM: subject line tweak] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 6 +++--- include/linux/security.h | 13 +++++++------ 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 6ef2a345ea03..8a90fd9ff3c8 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -235,9 +235,9 @@ LSM_HOOK(int, 0, task_fix_setgroups, struct cred *new, const struct cred * old) LSM_HOOK(int, 0, task_setpgid, struct task_struct *p, pid_t pgid) LSM_HOOK(int, 0, task_getpgid, struct task_struct *p) LSM_HOOK(int, 0, task_getsid, struct task_struct *p) -LSM_HOOK(void, LSM_RET_VOID, current_getsecid_subj, u32 *secid) -LSM_HOOK(void, LSM_RET_VOID, task_getsecid_obj, - struct task_struct *p, u32 *secid) +LSM_HOOK(void, LSM_RET_VOID, current_getlsmprop_subj, struct lsm_prop *prop) +LSM_HOOK(void, LSM_RET_VOID, task_getlsmprop_obj, + struct task_struct *p, struct lsm_prop *prop) LSM_HOOK(int, 0, task_setnice, struct task_struct *p, int nice) LSM_HOOK(int, 0, task_setioprio, struct task_struct *p, int ioprio) LSM_HOOK(int, 0, task_getioprio, struct task_struct *p) diff --git a/include/linux/security.h b/include/linux/security.h index 15aef5f68e77..9bc8153f4e8b 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -507,8 +507,8 @@ int security_task_fix_setgroups(struct cred *new, const struct cred *old); int security_task_setpgid(struct task_struct *p, pid_t pgid); int security_task_getpgid(struct task_struct *p); int security_task_getsid(struct task_struct *p); -void security_current_getsecid_subj(u32 *secid); -void security_task_getsecid_obj(struct task_struct *p, u32 *secid); +void security_current_getlsmprop_subj(struct lsm_prop *prop); +void security_task_getlsmprop_obj(struct task_struct *p, struct lsm_prop *prop); int security_task_setnice(struct task_struct *p, int nice); int security_task_setioprio(struct task_struct *p, int ioprio); int security_task_getioprio(struct task_struct *p); @@ -1305,14 +1305,15 @@ static inline int security_task_getsid(struct task_struct *p) return 0; } -static inline void security_current_getsecid_subj(u32 *secid) +static inline void security_current_getlsmprop_subj(struct lsm_prop *prop) { - *secid = 0; + lsmprop_init(prop); } -static inline void security_task_getsecid_obj(struct task_struct *p, u32 *secid) +static inline void security_task_getlsmprop_obj(struct task_struct *p, + struct lsm_prop *prop) { - *secid = 0; + lsmprop_init(prop); } static inline int security_task_setnice(struct task_struct *p, int nice) -- cgit v1.2.3 From 07f9d2c1132c9b838538b606dfcdab2506cd2ae4 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 9 Oct 2024 10:32:16 -0700 Subject: lsm: use lsm_prop in security_inode_getsecid Change the security_inode_getsecid() interface to fill in a lsm_prop structure instead of a u32 secid. This allows for its callers to gather data from all registered LSMs. Data is provided for IMA and audit. Change the name to security_inode_getlsmprop(). Cc: linux-integrity@vger.kernel.org Cc: selinux@vger.kernel.org Signed-off-by: Casey Schaufler [PM: subj line tweak] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 3 ++- include/linux/security.h | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 8a90fd9ff3c8..23ad7e4f8c67 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -176,7 +176,8 @@ LSM_HOOK(int, -EOPNOTSUPP, inode_setsecurity, struct inode *inode, const char *name, const void *value, size_t size, int flags) LSM_HOOK(int, 0, inode_listsecurity, struct inode *inode, char *buffer, size_t buffer_size) -LSM_HOOK(void, LSM_RET_VOID, inode_getsecid, struct inode *inode, u32 *secid) +LSM_HOOK(void, LSM_RET_VOID, inode_getlsmprop, struct inode *inode, + struct lsm_prop *prop) LSM_HOOK(int, 0, inode_copy_up, struct dentry *src, struct cred **new) LSM_HOOK(int, -EOPNOTSUPP, inode_copy_up_xattr, struct dentry *src, const char *name) diff --git a/include/linux/security.h b/include/linux/security.h index 9bc8153f4e8b..2b19ef5d799c 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -452,7 +452,7 @@ int security_inode_getsecurity(struct mnt_idmap *idmap, void **buffer, bool alloc); int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags); int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size); -void security_inode_getsecid(struct inode *inode, u32 *secid); +void security_inode_getlsmprop(struct inode *inode, struct lsm_prop *prop); int security_inode_copy_up(struct dentry *src, struct cred **new); int security_inode_copy_up_xattr(struct dentry *src, const char *name); int security_inode_setintegrity(const struct inode *inode, @@ -1076,9 +1076,10 @@ static inline int security_inode_listsecurity(struct inode *inode, char *buffer, return 0; } -static inline void security_inode_getsecid(struct inode *inode, u32 *secid) +static inline void security_inode_getlsmprop(struct inode *inode, + struct lsm_prop *prop) { - *secid = 0; + lsmprop_init(prop); } static inline int security_inode_copy_up(struct dentry *src, struct cred **new) -- cgit v1.2.3 From b0654ca42998440df42ba2ccc3b7dbe3bf5b7bb5 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 9 Oct 2024 10:32:18 -0700 Subject: lsm: create new security_cred_getlsmprop LSM hook Create a new LSM hook security_cred_getlsmprop() which, like security_cred_getsecid(), fetches LSM specific attributes from the cred structure. The associated data elements in the audit sub-system are changed from a secid to a lsm_prop to accommodate multiple possible LSM audit users. Cc: linux-integrity@vger.kernel.org Cc: audit@vger.kernel.org Cc: selinux@vger.kernel.org Signed-off-by: Casey Schaufler [PM: subj line tweak] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 2 ++ include/linux/security.h | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 23ad7e4f8c67..eb2937599cb0 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -218,6 +218,8 @@ LSM_HOOK(int, 0, cred_prepare, struct cred *new, const struct cred *old, LSM_HOOK(void, LSM_RET_VOID, cred_transfer, struct cred *new, const struct cred *old) LSM_HOOK(void, LSM_RET_VOID, cred_getsecid, const struct cred *c, u32 *secid) +LSM_HOOK(void, LSM_RET_VOID, cred_getlsmprop, const struct cred *c, + struct lsm_prop *prop) LSM_HOOK(int, 0, kernel_act_as, struct cred *new, u32 secid) LSM_HOOK(int, 0, kernel_create_files_as, struct cred *new, struct inode *inode) LSM_HOOK(int, 0, kernel_module_request, char *kmod_name) diff --git a/include/linux/security.h b/include/linux/security.h index 2b19ef5d799c..acd2e5d1b0ff 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -488,6 +488,7 @@ void security_cred_free(struct cred *cred); int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp); void security_transfer_creds(struct cred *new, const struct cred *old); void security_cred_getsecid(const struct cred *c, u32 *secid); +void security_cred_getlsmprop(const struct cred *c, struct lsm_prop *prop); int security_kernel_act_as(struct cred *new, u32 secid); int security_kernel_create_files_as(struct cred *new, struct inode *inode); int security_kernel_module_request(char *kmod_name); @@ -1229,6 +1230,10 @@ static inline void security_cred_getsecid(const struct cred *c, u32 *secid) *secid = 0; } +static inline void security_cred_getlsmprop(const struct cred *c, + struct lsm_prop *prop) +{ } + static inline int security_kernel_act_as(struct cred *cred, u32 secid) { return 0; -- cgit v1.2.3 From 8afd8c8faa24249e48f5007aee46209299377588 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 9 Oct 2024 10:32:21 -0700 Subject: lsm: remove lsm_prop scaffolding Remove the scaffold member from the lsm_prop. Remove the remaining places it is being set. Signed-off-by: Casey Schaufler [PM: subj line tweak] Signed-off-by: Paul Moore --- include/linux/security.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index acd2e5d1b0ff..fd690fa73162 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -156,11 +156,6 @@ enum lockdown_reason { LOCKDOWN_CONFIDENTIALITY_MAX, }; -/* scaffolding */ -struct lsm_prop_scaffold { - u32 secid; -}; - /* * Data exported by the security modules */ @@ -169,7 +164,6 @@ struct lsm_prop { struct lsm_prop_smack smack; struct lsm_prop_apparmor apparmor; struct lsm_prop_bpf bpf; - struct lsm_prop_scaffold scaffold; }; extern const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1]; -- cgit v1.2.3 From 8fa714ca334e0880665a14fed13b16e3e01a67b2 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 10 Oct 2024 21:15:35 +0300 Subject: iio: Convert unsigned to unsigned int Simple type conversion with no functional change implied. Signed-off-by: Andy Shevchenko Link: https://patch.msgid.link/20241010181535.3083262-1-andriy.shevchenko@linux.intel.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio-opaque.h | 2 +- include/linux/iio/iio.h | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h index 5aec3945555b..a89e7e43e441 100644 --- a/include/linux/iio/iio-opaque.h +++ b/include/linux/iio/iio-opaque.h @@ -70,7 +70,7 @@ struct iio_dev_opaque { #if defined(CONFIG_DEBUG_FS) struct dentry *debugfs_dentry; - unsigned cached_reg_addr; + unsigned int cached_reg_addr; char read_buf[20]; unsigned int read_buf_len; #endif diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 18779b631e90..3a9b57187a95 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -282,11 +282,11 @@ struct iio_chan_spec { const struct iio_chan_spec_ext_info *ext_info; const char *extend_name; const char *datasheet_name; - unsigned modified:1; - unsigned indexed:1; - unsigned output:1; - unsigned differential:1; - unsigned has_ext_scan_type:1; + unsigned int modified:1; + unsigned int indexed:1; + unsigned int output:1; + unsigned int differential:1; + unsigned int has_ext_scan_type:1; }; @@ -541,13 +541,13 @@ struct iio_info { int (*update_scan_mode)(struct iio_dev *indio_dev, const unsigned long *scan_mask); int (*debugfs_reg_access)(struct iio_dev *indio_dev, - unsigned reg, unsigned writeval, - unsigned *readval); + unsigned int reg, unsigned int writeval, + unsigned int *readval); int (*fwnode_xlate)(struct iio_dev *indio_dev, const struct fwnode_reference_args *iiospec); - int (*hwfifo_set_watermark)(struct iio_dev *indio_dev, unsigned val); + int (*hwfifo_set_watermark)(struct iio_dev *indio_dev, unsigned int val); int (*hwfifo_flush_to_buffer)(struct iio_dev *indio_dev, - unsigned count); + unsigned int count); }; /** @@ -609,7 +609,7 @@ struct iio_dev { int scan_bytes; const unsigned long *available_scan_masks; - unsigned __private masklength; + unsigned int __private masklength; const unsigned long *active_scan_mask; bool scan_timestamp; struct iio_trigger *trig; -- cgit v1.2.3 From 454bbde8f0d465e93e5a3a4003ac6c7e62fa4473 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 9 Oct 2024 10:28:19 +0800 Subject: net: skb: add pskb_network_may_pull_reason() helper Introduce the function pskb_network_may_pull_reason() and make pskb_network_may_pull() a simple inline call to it. The drop reasons of it just come from pskb_may_pull_reason. Signed-off-by: Menglong Dong Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/skbuff.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 39f1d16f3628..48f1e0fa2a13 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3130,9 +3130,15 @@ static inline int skb_inner_network_offset(const struct sk_buff *skb) return skb_inner_network_header(skb) - skb->data; } +static inline enum skb_drop_reason +pskb_network_may_pull_reason(struct sk_buff *skb, unsigned int len) +{ + return pskb_may_pull_reason(skb, skb_network_offset(skb) + len); +} + static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len) { - return pskb_may_pull(skb, skb_network_offset(skb) + len); + return pskb_network_may_pull_reason(skb, len) == SKB_NOT_DROPPED_YET; } /* -- cgit v1.2.3 From 7948483001039c00dcff4b94c181d7e14693a38c Mon Sep 17 00:00:00 2001 From: Gerhard Engleder Date: Fri, 11 Oct 2024 21:12:52 +0200 Subject: misc: keba: Add SPI controller device Add support for the SPI controller auxiliary device. This enables access to the SPI flash of the FPGA and some other SPI devices. The actual list of SPI devices is detected by reading some bits out of the previously registered I2C EEPROM. Signed-off-by: Gerhard Engleder Link: https://lore.kernel.org/r/20241011191257.19702-4-gerhard@engleder-embedded.com Signed-off-by: Greg Kroah-Hartman --- include/linux/misc/keba.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/misc/keba.h b/include/linux/misc/keba.h index 323b31a847c5..1bd5409c6f6f 100644 --- a/include/linux/misc/keba.h +++ b/include/linux/misc/keba.h @@ -7,6 +7,7 @@ #include struct i2c_board_info; +struct spi_board_info; /** * struct keba_i2c_auxdev - KEBA I2C auxiliary device @@ -22,4 +23,18 @@ struct keba_i2c_auxdev { struct i2c_board_info *info; }; +/** + * struct keba_spi_auxdev - KEBA SPI auxiliary device + * @auxdev: auxiliary device object + * @io: address range of SPI controller IO memory + * @info_size: number of SPI devices to be probed + * @info: SPI devices to be probed + */ +struct keba_spi_auxdev { + struct auxiliary_device auxdev; + struct resource io; + int info_size; + struct spi_board_info *info; +}; + #endif /* _LINUX_MISC_KEBA_H */ -- cgit v1.2.3 From f965d315bcbd65adfe5e3c161e46b5dc0a463f68 Mon Sep 17 00:00:00 2001 From: Gerhard Engleder Date: Fri, 11 Oct 2024 21:12:55 +0200 Subject: misc: keba: Add fan device Add support for the fan auxiliary device. This enables monitoring of the fan. Signed-off-by: Gerhard Engleder Link: https://lore.kernel.org/r/20241011191257.19702-7-gerhard@engleder-embedded.com Signed-off-by: Greg Kroah-Hartman --- include/linux/misc/keba.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/misc/keba.h b/include/linux/misc/keba.h index 1bd5409c6f6f..451777acc262 100644 --- a/include/linux/misc/keba.h +++ b/include/linux/misc/keba.h @@ -37,4 +37,14 @@ struct keba_spi_auxdev { struct spi_board_info *info; }; +/** + * struct keba_fan_auxdev - KEBA fan auxiliary device + * @auxdev: auxiliary device object + * @io: address range of fan controller IO memory + */ +struct keba_fan_auxdev { + struct auxiliary_device auxdev; + struct resource io; +}; + #endif /* _LINUX_MISC_KEBA_H */ -- cgit v1.2.3 From ca7b844b91920573835ad11daaa30630ce112fe1 Mon Sep 17 00:00:00 2001 From: Gerhard Engleder Date: Fri, 11 Oct 2024 21:12:56 +0200 Subject: misc: keba: Add battery device Add support for the battery auxiliary device. This enables monitoring of the battery. Signed-off-by: Gerhard Engleder Link: https://lore.kernel.org/r/20241011191257.19702-8-gerhard@engleder-embedded.com Signed-off-by: Greg Kroah-Hartman --- include/linux/misc/keba.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/misc/keba.h b/include/linux/misc/keba.h index 451777acc262..ca52716f8437 100644 --- a/include/linux/misc/keba.h +++ b/include/linux/misc/keba.h @@ -47,4 +47,14 @@ struct keba_fan_auxdev { struct resource io; }; +/** + * struct keba_batt_auxdev - KEBA battery auxiliary device + * @auxdev: auxiliary device object + * @io: address range of battery controller IO memory + */ +struct keba_batt_auxdev { + struct auxiliary_device auxdev; + struct resource io; +}; + #endif /* _LINUX_MISC_KEBA_H */ -- cgit v1.2.3 From a27b406a49225a17849c86221a32f2d598702719 Mon Sep 17 00:00:00 2001 From: Gerhard Engleder Date: Fri, 11 Oct 2024 21:12:57 +0200 Subject: misc: keba: Add UART devices Add support for the UART auxiliary devices. This enables access to up to 3 different UARTs, which are implemented in the FPGA. Signed-off-by: Gerhard Engleder Link: https://lore.kernel.org/r/20241011191257.19702-9-gerhard@engleder-embedded.com Signed-off-by: Greg Kroah-Hartman --- include/linux/misc/keba.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/misc/keba.h b/include/linux/misc/keba.h index ca52716f8437..a81d6fa70851 100644 --- a/include/linux/misc/keba.h +++ b/include/linux/misc/keba.h @@ -57,4 +57,16 @@ struct keba_batt_auxdev { struct resource io; }; +/** + * struct keba_uart_auxdev - KEBA UART auxiliary device + * @auxdev: auxiliary device object + * @io: address range of UART controller IO memory + * @irq: number of UART controller interrupt + */ +struct keba_uart_auxdev { + struct auxiliary_device auxdev; + struct resource io; + unsigned int irq; +}; + #endif /* _LINUX_MISC_KEBA_H */ -- cgit v1.2.3 From cec78a59abc9b1a06f742cb96479fc0bb1fe4e90 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Tue, 17 Sep 2024 18:42:56 +0800 Subject: list: Remove duplicated and unused macro list_for_each_reverse Remove macro list_for_each_reverse due to below reasons: - it is same as list_for_each_prev. - it is not used by current kernel tree. Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20240917-fix_list-v2-1-d2914665e89f@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/list.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index 5f4b0a39cf46..29a375889fb8 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -686,14 +686,6 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each(pos, head) \ for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next) -/** - * list_for_each_reverse - iterate backwards over a list - * @pos: the &struct list_head to use as a loop cursor. - * @head: the head for your list. - */ -#define list_for_each_reverse(pos, head) \ - for (pos = (head)->prev; pos != (head); pos = pos->prev) - /** * list_for_each_rcu - Iterate over a list in an RCU-safe fashion * @pos: the &struct list_head to use as a loop cursor. -- cgit v1.2.3 From 0ebe74c53b8b62bde7b02415d28e75aa25d6c2e6 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 29 Sep 2024 15:11:12 +0100 Subject: drivers/base: Remove unused auxiliary_find_device auxiliary_find_device has been unused since commit 1c5de097bea3 ("net/mlx5: Fix mlx5_get_next_dev() peer device matching") which was the only use since it was originally added. Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20240929141112.69824-1-linux@treblig.org Signed-off-by: Greg Kroah-Hartman --- include/linux/auxiliary_bus.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/auxiliary_bus.h b/include/linux/auxiliary_bus.h index 31762324bcc9..65dd7f154374 100644 --- a/include/linux/auxiliary_bus.h +++ b/include/linux/auxiliary_bus.h @@ -269,8 +269,4 @@ void auxiliary_driver_unregister(struct auxiliary_driver *auxdrv); #define module_auxiliary_driver(__auxiliary_driver) \ module_driver(__auxiliary_driver, auxiliary_driver_register, auxiliary_driver_unregister) -struct auxiliary_device *auxiliary_find_device(struct device *start, - const void *data, - device_match_t match); - #endif /* _AUXILIARY_BUS_H_ */ -- cgit v1.2.3 From cd068d51594d9635bf6688fc78717572b78bce6a Mon Sep 17 00:00:00 2001 From: Keita Aihara Date: Fri, 13 Sep 2024 18:44:17 +0900 Subject: mmc: core: Add SD card quirk for broken poweroff notification GIGASTONE Gaming Plus microSD cards manufactured on 02/2022 report that they support poweroff notification and cache, but they are not working correctly. Flush Cache bit never gets cleared in sd_flush_cache() and Poweroff Notification Ready bit also never gets set to 1 within 1 second from the end of busy of CMD49 in sd_poweroff_notify(). This leads to I/O error and runtime PM error state. I observed that the same card manufactured on 01/2024 works as expected. This problem seems similar to the Kingston cards fixed with commit c467c8f08185 ("mmc: Add MMC_QUIRK_BROKEN_SD_CACHE for Kingston Canvas Go Plus from 11/2019") and should be handled using quirks. CID for the problematic card is here. 12345641535443002000000145016200 Manufacturer ID is 0x12 and defined as CID_MANFID_GIGASTONE as of now, but would like comments on what naming is appropriate because MID list is not public and not sure it's right. Signed-off-by: Keita Aihara Link: https://lore.kernel.org/r/20240913094417.GA4191647@sony.com Signed-off-by: Ulf Hansson --- include/linux/mmc/card.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index f34407cc2788..543446392776 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -294,6 +294,7 @@ struct mmc_card { #define MMC_QUIRK_BROKEN_SD_DISCARD (1<<14) /* Disable broken SD discard support */ #define MMC_QUIRK_BROKEN_SD_CACHE (1<<15) /* Disable broken SD cache support */ #define MMC_QUIRK_BROKEN_CACHE_FLUSH (1<<16) /* Don't flush cache until the write has occurred */ +#define MMC_QUIRK_BROKEN_SD_POWEROFF_NOTIFY (1<<17) /* Disable broken SD poweroff notify support */ bool written_flag; /* Indicates eMMC has been written since power on */ bool reenable_cmdq; /* Re-enable Command Queue */ -- cgit v1.2.3 From fce2ce78af1e14dc1316aaddb5b3308be05cf452 Mon Sep 17 00:00:00 2001 From: Avri Altman Date: Sun, 6 Oct 2024 08:11:39 +0300 Subject: mmc: sd: SDUC Support Recognition Ultra Capacity SD cards (SDUC) was already introduced in SD7.0. Those cards support capacity larger than 2TB and up to including 128TB. ACMD41 was extended to support the host-card handshake during initialization. The card expects that the HCS & HO2T bits to be set in the command argument, and sets the applicable bits in the R3 returned response. On the contrary, if a SDUC card is inserted to a non-supporting host, it will never respond to this ACMD41 until eventually, the host will timed out and give up. Also, add SD CSD version 3.0 - designated for SDUC, and properly parse the csd register as the c_size field got expanded to 28 bits. Do not enable SDUC for now - leave it to the last patch in the series. Tested-by: Ricky WU Reviewed-by: Adrian Hunter Signed-off-by: Avri Altman Link: https://lore.kernel.org/r/20241006051148.160278-2-avri.altman@wdc.com Signed-off-by: Ulf Hansson --- include/linux/mmc/card.h | 2 +- include/linux/mmc/sd.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 543446392776..eb67d3d5ff5b 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -35,7 +35,7 @@ struct mmc_csd { unsigned int wp_grp_size; unsigned int read_blkbits; unsigned int write_blkbits; - unsigned int capacity; + sector_t capacity; unsigned int read_partial:1, read_misalign:1, write_partial:1, diff --git a/include/linux/mmc/sd.h b/include/linux/mmc/sd.h index 6727576a8755..865cc0ca8543 100644 --- a/include/linux/mmc/sd.h +++ b/include/linux/mmc/sd.h @@ -36,6 +36,7 @@ /* OCR bit definitions */ #define SD_OCR_S18R (1 << 24) /* 1.8V switching request */ #define SD_ROCR_S18A SD_OCR_S18R /* 1.8V switching accepted by card */ +#define SD_OCR_2T (1 << 27) /* HO2T/CO2T - SDUC support */ #define SD_OCR_XPC (1 << 28) /* SDXC power control */ #define SD_OCR_CCS (1 << 30) /* Card Capacity Status */ -- cgit v1.2.3 From 375b535941bea65b37451f0fd398e28bf4f3bdc3 Mon Sep 17 00:00:00 2001 From: Avri Altman Date: Sun, 6 Oct 2024 08:11:40 +0300 Subject: mmc: sd: Add Extension memory addressing SDUC memory addressing spans beyond 2TB and up to 128TB. Therefore, 38 bits are required to access the entire memory space of all sectors. Those extra 6 bits are to be carried by CMD22 prior of sending read/write/erase commands: CMD17, CMD18, CMD24, CMD25, CMD32, and CMD33. CMD22 will carry the higher order 6 bits, and must precedes any of the above commands even if it targets sector < 2TB. No error related to address or length is indicated in CMD22 but rather in the read/write command itself. Tested-by: Ricky WU Reviewed-by: Adrian Hunter Signed-off-by: Avri Altman Link: https://lore.kernel.org/r/20241006051148.160278-3-avri.altman@wdc.com Signed-off-by: Ulf Hansson --- include/linux/mmc/sd.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmc/sd.h b/include/linux/mmc/sd.h index 865cc0ca8543..af5fc70e09a2 100644 --- a/include/linux/mmc/sd.h +++ b/include/linux/mmc/sd.h @@ -15,6 +15,9 @@ #define SD_SEND_IF_COND 8 /* bcr [11:0] See below R7 */ #define SD_SWITCH_VOLTAGE 11 /* ac R1 */ +/* Class 2 */ +#define SD_ADDR_EXT 22 /* ac [5:0] R1 */ + /* class 10 */ #define SD_SWITCH 6 /* adtc [31:0] See below R1 */ -- cgit v1.2.3 From 403a0293f1c230524e0185b31f69c02a6aed12c7 Mon Sep 17 00:00:00 2001 From: Avri Altman Date: Sun, 6 Oct 2024 08:11:42 +0300 Subject: mmc: core: Add open-ended Ext memory addressing For open-ended read/write - just send CMD22 before issuing the command. Reviewed-by: Adrian Hunter Signed-off-by: Avri Altman Link: https://lore.kernel.org/r/20241006051148.160278-5-avri.altman@wdc.com Signed-off-by: Ulf Hansson --- include/linux/mmc/core.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index f0ac2e469b32..a890a71288ef 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -96,6 +96,10 @@ struct mmc_command { unsigned int busy_timeout; /* busy detect timeout in ms */ struct mmc_data *data; /* data segment associated with cmd */ struct mmc_request *mrq; /* associated request */ + + /* for SDUC */ + bool has_ext_addr; + u8 ext_addr; }; struct mmc_data { -- cgit v1.2.3 From 79daeb241db7901e4bd53cce9ab046f376a63a4c Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Fri, 13 Sep 2024 18:28:16 +0800 Subject: mmc: core: Prepare to support SD UHS-II cards The SD UHS-II interface was introduced to the SD spec v4.00 several years ago. The interface is fundamentally different from an electrical and a protocol point of view, comparing to the legacy SD interface. However, the legacy SD protocol is supported through a specific transport layer (SD-TRAN) defined in the UHS-II addendum of the spec. This allows the SD card to be managed in a very similar way as a legacy SD card, hence a lot of code can be re-used to support these new types of cards through the mmc subsystem. Moreover, an SD card that supports the UHS-II interface shall also be backwards compatible with the legacy SD interface, which allows a UHS-II card to be inserted into a legacy slot. As a matter of fact, this is already supported by mmc subsystem as of today. To prepare to add support for UHS-II, this change puts the basic foundation in the mmc core in place, allowing it to be more easily reviewed before subsequent changes implements the actual support. Basically, the approach here adds a new UHS-II bus_ops type and adds a separate initialization path for the UHS-II card. The intent is to avoid us from sprinkling the legacy initialization path, but also to simplify implementation of the UHS-II specific bits. At this point, there is only one new host ops added to manage the various ios settings needed for UHS-II. Additional host ops that are needed, are being added from subsequent changes. Signed-off-by: Ulf Hansson Link: https://lore.kernel.org/r/20240913102836.6144-3-victorshihgli@gmail.com --- include/linux/mmc/card.h | 7 +++++++ include/linux/mmc/host.h | 23 +++++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index eb67d3d5ff5b..0bb1ad1ea4dc 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -209,6 +209,11 @@ struct sd_ext_reg { #define SD_EXT_PERF_CMD_QUEUE (1<<4) }; +struct sd_uhs2_config { + u32 node_id; + /* TODO: Extend with more register configs. */ +}; + struct sdio_cccr { unsigned int sdio_vsn; unsigned int sd_vsn; @@ -320,6 +325,8 @@ struct mmc_card { struct sd_ext_reg ext_power; /* SD extension reg for PM */ struct sd_ext_reg ext_perf; /* SD extension reg for PERF */ + struct sd_uhs2_config uhs2_config; /* SD UHS-II config */ + unsigned int sdio_funcs; /* number of SDIO functions */ atomic_t sdio_funcs_probed; /* number of probed SDIO funcs */ struct sdio_cccr cccr; /* common card info */ diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 8fc2b328ec4d..e1d380de3aaa 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -64,6 +64,10 @@ struct mmc_ios { #define MMC_TIMING_MMC_HS400 10 #define MMC_TIMING_SD_EXP 11 #define MMC_TIMING_SD_EXP_1_2V 12 +#define MMC_TIMING_UHS2_SPEED_A 13 +#define MMC_TIMING_UHS2_SPEED_A_HD 14 +#define MMC_TIMING_UHS2_SPEED_B 15 +#define MMC_TIMING_UHS2_SPEED_B_HD 16 unsigned char signal_voltage; /* signalling voltage (1.8V or 3.3V) */ @@ -92,6 +96,14 @@ struct mmc_clk_phase_map { struct mmc_clk_phase phase[MMC_NUM_CLK_PHASES]; }; +struct sd_uhs2_caps { + /* TODO: Add UHS-II capabilities for the host. */ +}; + +enum sd_uhs2_operation { + UHS2_SET_IOS, +}; + struct mmc_host; enum mmc_err_stat { @@ -219,6 +231,14 @@ struct mmc_host_ops { /* Initialize an SD express card, mandatory for MMC_CAP2_SD_EXP. */ int (*init_sd_express)(struct mmc_host *host, struct mmc_ios *ios); + + /* + * The uhs2_control callback is used to execute SD UHS-II specific + * operations. It's mandatory to implement for hosts that supports the + * SD UHS-II interface (MMC_CAP2_SD_UHS2). Expected return values are a + * negative errno in case of a failure or zero for success. + */ + int (*uhs2_control)(struct mmc_host *host, enum sd_uhs2_operation op); }; struct mmc_cqe_ops { @@ -379,6 +399,7 @@ struct mmc_host { MMC_CAP2_HS200_1_2V_SDR) #define MMC_CAP2_SD_EXP (1 << 7) /* SD express via PCIe */ #define MMC_CAP2_SD_EXP_1_2V (1 << 8) /* SD express 1.2V */ +#define MMC_CAP2_SD_UHS2 (1 << 9) /* SD UHS-II support */ #define MMC_CAP2_CD_ACTIVE_HIGH (1 << 10) /* Card-detect signal active high */ #define MMC_CAP2_RO_ACTIVE_HIGH (1 << 11) /* Write-protect signal active high */ #define MMC_CAP2_NO_PRESCAN_POWERUP (1 << 14) /* Don't power up before scan */ @@ -405,6 +426,8 @@ struct mmc_host { #endif #define MMC_CAP2_ALT_GPT_TEGRA (1 << 28) /* Host with eMMC that has GPT entry at a non-standard location */ + struct sd_uhs2_caps uhs2_caps; /* Host UHS-II capabilities */ + int fixed_drv_type; /* fixed driver type for non-removable media */ mmc_pm_flag_t pm_caps; /* supported pm features */ -- cgit v1.2.3 From 153196d550c747367bdbec5cd545a572c5310451 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Fri, 13 Sep 2024 18:28:17 +0800 Subject: mmc: core: Announce successful insertion of an SD UHS-II card To inform the users about SD UHS-II cards, let's extend the print at card insertion with a "UHS-II" substring. Within this change, it seems reasonable to convert from using "ultra high speed" into "UHS-I speed", for the UHS-I type, as it should makes it more clear. Note that, the new print for UHS-II cards doesn't include the actual selected speed mode. Instead, this is going to be added from subsequent change. Signed-off-by: Ulf Hansson Link: https://lore.kernel.org/r/20240913102836.6144-4-victorshihgli@gmail.com --- include/linux/mmc/host.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index e1d380de3aaa..9a148280bf42 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -638,6 +638,14 @@ static inline int mmc_card_uhs(struct mmc_card *card) card->host->ios.timing <= MMC_TIMING_UHS_DDR50; } +static inline bool mmc_card_uhs2(struct mmc_host *host) +{ + return host->ios.timing == MMC_TIMING_UHS2_SPEED_A || + host->ios.timing == MMC_TIMING_UHS2_SPEED_A_HD || + host->ios.timing == MMC_TIMING_UHS2_SPEED_B || + host->ios.timing == MMC_TIMING_UHS2_SPEED_B_HD; +} + void mmc_retune_timer_stop(struct mmc_host *host); static inline void mmc_retune_needed(struct mmc_host *host) -- cgit v1.2.3 From a56ffd3a83ed2e10e0d9e0b199547bfa0d206aac Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Fri, 13 Sep 2024 18:28:18 +0800 Subject: mmc: core: Extend support for mmc regulators with a vqmmc2 To allow an additional external regulator to be controlled by an mmc host driver, let's add support for a vqmmc2 regulator to the mmc core. For an SD UHS-II interface the vqmmc2 regulator may correspond to the so called vdd2 supply, as described by the SD spec. Initially, only 1.8V is needed, hence limit the new helper function, mmc_regulator_set_vqmmc2() to this too. Note that, to allow for flexibility mmc host drivers need to manage the enable/disable of the vqmmc2 regulator themselves, while the regulator is looked up through the common mmc_regulator_get_supply(). Signed-off-by: Ulf Hansson Link: https://lore.kernel.org/r/20240913102836.6144-5-victorshihgli@gmail.com --- include/linux/mmc/host.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 9a148280bf42..a7f790c75bc8 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -75,6 +75,9 @@ struct mmc_ios { #define MMC_SIGNAL_VOLTAGE_180 1 #define MMC_SIGNAL_VOLTAGE_120 2 + unsigned char vqmmc2_voltage; +#define MMC_VQMMC2_VOLTAGE_180 0 + unsigned char drv_type; /* driver type (A, B, C, D) */ #define MMC_SET_DRIVER_TYPE_B 0 @@ -308,6 +311,7 @@ struct mmc_pwrseq; struct mmc_supply { struct regulator *vmmc; /* Card power supply */ struct regulator *vqmmc; /* Optional Vccq supply */ + struct regulator *vqmmc2; /* Optional supply for phy */ }; struct mmc_ctx { @@ -590,6 +594,7 @@ int mmc_regulator_set_ocr(struct mmc_host *mmc, struct regulator *supply, unsigned short vdd_bit); int mmc_regulator_set_vqmmc(struct mmc_host *mmc, struct mmc_ios *ios); +int mmc_regulator_set_vqmmc2(struct mmc_host *mmc, struct mmc_ios *ios); #else static inline int mmc_regulator_set_ocr(struct mmc_host *mmc, struct regulator *supply, @@ -603,6 +608,12 @@ static inline int mmc_regulator_set_vqmmc(struct mmc_host *mmc, { return -EINVAL; } + +static inline int mmc_regulator_set_vqmmc2(struct mmc_host *mmc, + struct mmc_ios *ios) +{ + return -EINVAL; +} #endif int mmc_regulator_get_supply(struct mmc_host *mmc); -- cgit v1.2.3 From a9a75f9dc23c1562dcb261c0a8f3d6fc70d246cd Mon Sep 17 00:00:00 2001 From: Victor Shih Date: Fri, 13 Sep 2024 18:28:19 +0800 Subject: mmc: core: Add definitions for SD UHS-II cards Add UHS-II specific data structures for commands and defines for registers, as described in Part 1 UHS-II Addendum Version 1.01. UHS-II related definitions are listed below: 1. UHS-II card capability: sd_uhs2_caps{} 2. UHS-II configuration: sd_uhs2_config{} 3. UHS-II register I/O address and register field definitions: sd_uhs2.h Signed-off-by: Jason Lai Signed-off-by: Victor Shih Link: https://lore.kernel.org/r/20240913102836.6144-6-victorshihgli@gmail.com Signed-off-by: Ulf Hansson --- include/linux/mmc/card.h | 31 +++++- include/linux/mmc/host.h | 25 ++++- include/linux/mmc/sd_uhs2.h | 240 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 294 insertions(+), 2 deletions(-) create mode 100644 include/linux/mmc/sd_uhs2.h (limited to 'include/linux') diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 0bb1ad1ea4dc..526fce581657 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -188,6 +188,12 @@ struct sd_switch_caps { #define SD_MAX_CURRENT_400 (1 << SD_SET_CURRENT_LIMIT_400) #define SD_MAX_CURRENT_600 (1 << SD_SET_CURRENT_LIMIT_600) #define SD_MAX_CURRENT_800 (1 << SD_SET_CURRENT_LIMIT_800) + +#define SD4_SET_POWER_LIMIT_0_72W 0 +#define SD4_SET_POWER_LIMIT_1_44W 1 +#define SD4_SET_POWER_LIMIT_2_16W 2 +#define SD4_SET_POWER_LIMIT_2_88W 3 +#define SD4_SET_POWER_LIMIT_1_80W 4 }; struct sd_ext_reg { @@ -211,7 +217,30 @@ struct sd_ext_reg { struct sd_uhs2_config { u32 node_id; - /* TODO: Extend with more register configs. */ + + u32 n_fcu; + u32 maxblk_len; + u8 n_lanes; + u8 dadr_len; + u8 app_type; + u8 phy_minor_rev; + u8 phy_major_rev; + u8 can_hibernate; + u8 n_lss_sync; + u8 n_lss_dir; + u8 link_minor_rev; + u8 link_major_rev; + u8 dev_type; + u8 n_data_gap; + + u32 n_fcu_set; + u32 maxblk_len_set; + u8 n_lanes_set; + u8 speed_range_set; + u8 n_lss_sync_set; + u8 n_lss_dir_set; + u8 n_data_gap_set; + u8 max_retry_set; }; struct sdio_cccr { diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index a7f790c75bc8..0980d06ed419 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -17,6 +17,7 @@ #include #include #include +#include struct mmc_ios { unsigned int clock; /* clock rate */ @@ -100,7 +101,29 @@ struct mmc_clk_phase_map { }; struct sd_uhs2_caps { - /* TODO: Add UHS-II capabilities for the host. */ + u32 dap; + u32 gap; + u32 group_desc; + u32 maxblk_len; + u32 n_fcu; + u8 n_lanes; + u8 addr64; + u8 card_type; + u8 phy_rev; + u8 speed_range; + u8 n_lss_sync; + u8 n_lss_dir; + u8 link_rev; + u8 host_type; + u8 n_data_gap; + + u32 maxblk_len_set; + u32 n_fcu_set; + u8 n_lanes_set; + u8 n_lss_sync_set; + u8 n_lss_dir_set; + u8 n_data_gap_set; + u8 max_retry_set; }; enum sd_uhs2_operation { diff --git a/include/linux/mmc/sd_uhs2.h b/include/linux/mmc/sd_uhs2.h new file mode 100644 index 000000000000..7abe9bd870c7 --- /dev/null +++ b/include/linux/mmc/sd_uhs2.h @@ -0,0 +1,240 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Header file for UHS-II packets, Host Controller registers and I/O + * accessors. + * + * Copyright (C) 2014 Intel Corp, All Rights Reserved. + */ +#ifndef LINUX_MMC_UHS2_H +#define LINUX_MMC_UHS2_H + +/* LINK Layer definition */ +/* + * UHS2 Header: + * Refer to UHS-II Addendum Version 1.02 Figure 5-2, the format of CCMD Header is described below: + * bit [3:0] : DID(Destination ID = Node ID of UHS2 card) + * bit [6:4] : TYP(Packet Type) + * 000b: CCMD(Control command packet) + * 001b: DCMD(Data command packet) + * 010b: RES(Response packet) + * 011b: DATA(Data payload packet) + * 111b: MSG(Message packet) + * Others: Reserved + * bit [7] : NP(Native Packet) + * bit [10:8] : TID(Transaction ID) + * bit [11] : Reserved + * bit [15:12]: SID(Source ID 0: Node ID of Host) + * + * Broadcast CCMD issued by Host is represented as DID=SID=0. + */ +/* + * UHS2 Argument: + * Refer to UHS-II Addendum Version 1.02 Figure 6-5, the format of CCMD Argument is described below: + * bit [3:0] : MSB of IOADR + * bit [5:4] : PLEN(Payload Length) + * 00b: 0 byte + * 01b: 4 bytes + * 10b: 8 bytes + * 11b: 16 bytes + * bit [6] : Reserved + * bit [7] : R/W(Read/Write) + * 0: Control read command + * 1: Control write command + * bit [15:8] : LSB of IOADR + * + * I/O Address specifies the address of register in UHS-II I/O space accessed by CCMD. + * The unit of I/O Address is 4 Bytes. It is transmitted in MSB first, LSB last. + */ +#define UHS2_NATIVE_PACKET_POS 7 +#define UHS2_NATIVE_PACKET (1 << UHS2_NATIVE_PACKET_POS) + +#define UHS2_PACKET_TYPE_POS 4 +#define UHS2_PACKET_TYPE_CCMD (0 << UHS2_PACKET_TYPE_POS) +#define UHS2_PACKET_TYPE_DCMD (1 << UHS2_PACKET_TYPE_POS) +#define UHS2_PACKET_TYPE_RES (2 << UHS2_PACKET_TYPE_POS) +#define UHS2_PACKET_TYPE_DATA (3 << UHS2_PACKET_TYPE_POS) +#define UHS2_PACKET_TYPE_MSG (7 << UHS2_PACKET_TYPE_POS) + +#define UHS2_DEST_ID_MASK 0x0F +#define UHS2_DEST_ID 0x1 + +#define UHS2_SRC_ID_POS 12 +#define UHS2_SRC_ID_MASK 0xF000 + +#define UHS2_TRANS_ID_POS 8 +#define UHS2_TRANS_ID_MASK 0x0700 + +/* UHS2 MSG */ +#define UHS2_MSG_CTG_POS 5 +#define UHS2_MSG_CTG_LMSG 0x00 +#define UHS2_MSG_CTG_INT 0x60 +#define UHS2_MSG_CTG_AMSG 0x80 + +#define UHS2_MSG_CTG_FCREQ 0x00 +#define UHS2_MSG_CTG_FCRDY 0x01 +#define UHS2_MSG_CTG_STAT 0x02 + +#define UHS2_MSG_CODE_POS 8 +#define UHS2_MSG_CODE_FC_UNRECOVER_ERR 0x8 +#define UHS2_MSG_CODE_STAT_UNRECOVER_ERR 0x8 +#define UHS2_MSG_CODE_STAT_RECOVER_ERR 0x1 + +/* TRANS Layer definition */ + +/* Native packets*/ +#define UHS2_NATIVE_CMD_RW_POS 7 +#define UHS2_NATIVE_CMD_WRITE (1 << UHS2_NATIVE_CMD_RW_POS) +#define UHS2_NATIVE_CMD_READ (0 << UHS2_NATIVE_CMD_RW_POS) + +#define UHS2_NATIVE_CMD_PLEN_POS 4 +#define UHS2_NATIVE_CMD_PLEN_4B (1 << UHS2_NATIVE_CMD_PLEN_POS) +#define UHS2_NATIVE_CMD_PLEN_8B (2 << UHS2_NATIVE_CMD_PLEN_POS) +#define UHS2_NATIVE_CMD_PLEN_16B (3 << UHS2_NATIVE_CMD_PLEN_POS) + +#define UHS2_NATIVE_CCMD_GET_MIOADR_MASK 0xF00 +#define UHS2_NATIVE_CCMD_MIOADR_MASK 0x0F + +#define UHS2_NATIVE_CCMD_LIOADR_POS 8 +#define UHS2_NATIVE_CCMD_GET_LIOADR_MASK 0x0FF + +#define UHS2_CCMD_DEV_INIT_COMPLETE_FLAG BIT(11) +#define UHS2_DEV_INIT_PAYLOAD_LEN 1 +#define UHS2_DEV_INIT_RESP_LEN 6 +#define UHS2_DEV_ENUM_PAYLOAD_LEN 1 +#define UHS2_DEV_ENUM_RESP_LEN 8 +#define UHS2_CFG_WRITE_PAYLOAD_LEN 2 +#define UHS2_CFG_WRITE_PHY_SET_RESP_LEN 4 +#define UHS2_CFG_WRITE_GENERIC_SET_RESP_LEN 5 +#define UHS2_GO_DORMANT_PAYLOAD_LEN 1 + +/* + * UHS2 Argument: + * Refer to UHS-II Addendum Version 1.02 Figure 6-8, the format of DCMD Argument is described below: + * bit [3:0] : Reserved + * bit [6:3] : TMODE(Transfer Mode) + * bit 3: DAM(Data Access Mode) + * bit 4: TLUM(TLEN Unit Mode) + * bit 5: LM(Length Mode) + * bit 6: DM(Duplex Mode) + * bit [7] : R/W(Read/Write) + * 0: Control read command + * 1: Control write command + * bit [15:8] : Reserved + * + * I/O Address specifies the address of register in UHS-II I/O space accessed by CCMD. + * The unit of I/O Address is 4 Bytes. It is transmitted in MSB first, LSB last. + */ +#define UHS2_DCMD_DM_POS 6 +#define UHS2_DCMD_2L_HD_MODE (1 << UHS2_DCMD_DM_POS) +#define UHS2_DCMD_LM_POS 5 +#define UHS2_DCMD_LM_TLEN_EXIST (1 << UHS2_DCMD_LM_POS) +#define UHS2_DCMD_TLUM_POS 4 +#define UHS2_DCMD_TLUM_BYTE_MODE (1 << UHS2_DCMD_TLUM_POS) +#define UHS2_NATIVE_DCMD_DAM_POS 3 +#define UHS2_NATIVE_DCMD_DAM_IO (1 << UHS2_NATIVE_DCMD_DAM_POS) + +#define UHS2_RES_NACK_POS 7 +#define UHS2_RES_NACK_MASK (0x1 << UHS2_RES_NACK_POS) + +#define UHS2_RES_ECODE_POS 4 +#define UHS2_RES_ECODE_MASK 0x7 +#define UHS2_RES_ECODE_COND 1 +#define UHS2_RES_ECODE_ARG 2 +#define UHS2_RES_ECODE_GEN 3 + +/* IOADR of device registers */ +#define UHS2_IOADR_GENERIC_CAPS 0x00 +#define UHS2_IOADR_PHY_CAPS 0x02 +#define UHS2_IOADR_LINK_CAPS 0x04 +#define UHS2_IOADR_RSV_CAPS 0x06 +#define UHS2_IOADR_GENERIC_SETTINGS 0x08 +#define UHS2_IOADR_PHY_SETTINGS 0x0A +#define UHS2_IOADR_LINK_SETTINGS 0x0C +#define UHS2_IOADR_PRESET 0x40 + +/* SD application packets */ +#define UHS2_SD_CMD_INDEX_POS 8 + +#define UHS2_SD_CMD_APP_POS 14 +#define UHS2_SD_CMD_APP (1 << UHS2_SD_CMD_APP_POS) + +/* UHS-II Device Registers */ +#define UHS2_DEV_CONFIG_REG 0x000 + +/* General Caps and Settings registers */ +#define UHS2_DEV_CONFIG_GEN_CAPS (UHS2_DEV_CONFIG_REG + 0x000) +#define UHS2_DEV_CONFIG_N_LANES_POS 8 +#define UHS2_DEV_CONFIG_N_LANES_MASK 0x3F +#define UHS2_DEV_CONFIG_2L_HD_FD 0x1 +#define UHS2_DEV_CONFIG_2D1U_FD 0x2 +#define UHS2_DEV_CONFIG_1D2U_FD 0x4 +#define UHS2_DEV_CONFIG_2D2U_FD 0x8 +#define UHS2_DEV_CONFIG_DADR_POS 14 +#define UHS2_DEV_CONFIG_DADR_MASK 0x1 +#define UHS2_DEV_CONFIG_APP_POS 16 +#define UHS2_DEV_CONFIG_APP_MASK 0xFF +#define UHS2_DEV_CONFIG_APP_SD_MEM 0x1 + +#define UHS2_DEV_CONFIG_GEN_SET (UHS2_DEV_CONFIG_REG + 0x008) +#define UHS2_DEV_CONFIG_GEN_SET_N_LANES_POS 8 +#define UHS2_DEV_CONFIG_GEN_SET_2L_FD_HD 0x0 +#define UHS2_DEV_CONFIG_GEN_SET_2D1U_FD 0x2 +#define UHS2_DEV_CONFIG_GEN_SET_1D2U_FD 0x3 +#define UHS2_DEV_CONFIG_GEN_SET_2D2U_FD 0x4 +#define UHS2_DEV_CONFIG_GEN_SET_CFG_COMPLETE BIT(31) + +/* PHY Caps and Settings registers */ +#define UHS2_DEV_CONFIG_PHY_CAPS (UHS2_DEV_CONFIG_REG + 0x002) +#define UHS2_DEV_CONFIG_PHY_MINOR_MASK 0xF +#define UHS2_DEV_CONFIG_PHY_MAJOR_POS 4 +#define UHS2_DEV_CONFIG_PHY_MAJOR_MASK 0x3 +#define UHS2_DEV_CONFIG_CAN_HIBER_POS 15 +#define UHS2_DEV_CONFIG_CAN_HIBER_MASK 0x1 +#define UHS2_DEV_CONFIG_PHY_CAPS1 (UHS2_DEV_CONFIG_REG + 0x003) +#define UHS2_DEV_CONFIG_N_LSS_SYN_MASK 0xF +#define UHS2_DEV_CONFIG_N_LSS_DIR_POS 4 +#define UHS2_DEV_CONFIG_N_LSS_DIR_MASK 0xF + +#define UHS2_DEV_CONFIG_PHY_SET (UHS2_DEV_CONFIG_REG + 0x00A) +#define UHS2_DEV_CONFIG_PHY_SET_SPEED_POS 6 +#define UHS2_DEV_CONFIG_PHY_SET_SPEED_A 0x0 +#define UHS2_DEV_CONFIG_PHY_SET_SPEED_B 0x1 + +/* LINK-TRAN Caps and Settings registers */ +#define UHS2_DEV_CONFIG_LINK_TRAN_CAPS (UHS2_DEV_CONFIG_REG + 0x004) +#define UHS2_DEV_CONFIG_LT_MINOR_MASK 0xF +#define UHS2_DEV_CONFIG_LT_MAJOR_POS 4 +#define UHS2_DEV_CONFIG_LT_MAJOR_MASK 0x3 +#define UHS2_DEV_CONFIG_N_FCU_POS 8 +#define UHS2_DEV_CONFIG_N_FCU_MASK 0xFF +#define UHS2_DEV_CONFIG_DEV_TYPE_POS 16 +#define UHS2_DEV_CONFIG_DEV_TYPE_MASK 0x7 +#define UHS2_DEV_CONFIG_MAX_BLK_LEN_POS 20 +#define UHS2_DEV_CONFIG_MAX_BLK_LEN_MASK 0xFFF +#define UHS2_DEV_CONFIG_LINK_TRAN_CAPS1 (UHS2_DEV_CONFIG_REG + 0x005) +#define UHS2_DEV_CONFIG_N_DATA_GAP_MASK 0xFF + +#define UHS2_DEV_CONFIG_LINK_TRAN_SET (UHS2_DEV_CONFIG_REG + 0x00C) +#define UHS2_DEV_CONFIG_LT_SET_MAX_BLK_LEN 0x200 +#define UHS2_DEV_CONFIG_LT_SET_MAX_RETRY_POS 16 + +/* Preset register */ +#define UHS2_DEV_CONFIG_PRESET (UHS2_DEV_CONFIG_REG + 0x040) + +#define UHS2_DEV_INT_REG 0x100 + +#define UHS2_DEV_STATUS_REG 0x180 + +#define UHS2_DEV_CMD_REG 0x200 +#define UHS2_DEV_CMD_FULL_RESET (UHS2_DEV_CMD_REG + 0x000) +#define UHS2_DEV_CMD_GO_DORMANT_STATE (UHS2_DEV_CMD_REG + 0x001) +#define UHS2_DEV_CMD_DORMANT_HIBER BIT(7) +#define UHS2_DEV_CMD_DEVICE_INIT (UHS2_DEV_CMD_REG + 0x002) +#define UHS2_DEV_INIT_COMPLETE_FLAG BIT(11) +#define UHS2_DEV_CMD_ENUMERATE (UHS2_DEV_CMD_REG + 0x003) +#define UHS2_DEV_CMD_TRANS_ABORT (UHS2_DEV_CMD_REG + 0x004) + +#define UHS2_RCLK_MAX 52000000 +#define UHS2_RCLK_MIN 26000000 + +#endif /* LINUX_MMC_UHS2_H */ -- cgit v1.2.3 From 5bb798cfdfd00182065cf55c1ff4b2c08d3be13f Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 28 Sep 2024 18:20:56 +0200 Subject: memstick: Constify struct memstick_device_id 'struct memstick_device_id' are not modified in these drivers. Constifying this structure moves some data to a read-only section, so increases overall security. Update memstick_dev_match(), memstick_bus_match() and struct memstick_driver accordingly. On a x86_64, with allmodconfig, as an example: Before: ====== text data bss dec hex filename 74055 3455 88 77598 12f1e drivers/memstick/core/ms_block.o After: ===== text data bss dec hex filename 74087 3423 88 77598 12f1e drivers/memstick/core/ms_block.o Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/6509d6f6ed64193f04e747a98ccea7492c976ca8.1727540434.git.christophe.jaillet@wanadoo.fr Signed-off-by: Ulf Hansson --- include/linux/memstick.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memstick.h b/include/linux/memstick.h index ebf73d4ee969..107bdcbedf79 100644 --- a/include/linux/memstick.h +++ b/include/linux/memstick.h @@ -293,7 +293,7 @@ struct memstick_host { }; struct memstick_driver { - struct memstick_device_id *id_table; + const struct memstick_device_id *id_table; int (*probe)(struct memstick_dev *card); void (*remove)(struct memstick_dev *card); int (*suspend)(struct memstick_dev *card, -- cgit v1.2.3 From 7e019dcc470f27066c98697e43d930df8d54bd9c Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 9 Oct 2024 09:50:07 -0400 Subject: sched: Improve cache locality of RSEQ concurrency IDs for intermittent workloads commit 223baf9d17f25 ("sched: Fix performance regression introduced by mm_cid") introduced a per-mm/cpu current concurrency id (mm_cid), which keeps a reference to the concurrency id allocated for each CPU. This reference expires shortly after a 100ms delay. These per-CPU references keep the per-mm-cid data cache-local in situations where threads are running at least once on each CPU within each 100ms window, thus keeping the per-cpu reference alive. However, intermittent workloads behaving in bursts spaced by more than 100ms on each CPU exhibit bad cache locality and degraded performance compared to purely per-cpu data indexing, because concurrency IDs are allocated over various CPUs and cores, therefore losing cache locality of the associated data. Introduce the following changes to improve per-mm-cid cache locality: - Add a "recent_cid" field to the per-mm/cpu mm_cid structure to keep track of which mm_cid value was last used, and use it as a hint to attempt re-allocating the same concurrency ID the next time this mm/cpu needs to allocate a concurrency ID, - Add a per-mm CPUs allowed mask, which keeps track of the union of CPUs allowed for all threads belonging to this mm. This cpumask is only set during the lifetime of the mm, never cleared, so it represents the union of all the CPUs allowed since the beginning of the mm lifetime (note that the mm_cpumask() is really arch-specific and tailored to the TLB flush needs, and is thus _not_ a viable approach for this), - Add a per-mm nr_cpus_allowed to keep track of the weight of the per-mm CPUs allowed mask (for fast access), - Add a per-mm max_nr_cid to keep track of the highest number of concurrency IDs allocated for the mm. This is used for expanding the concurrency ID allocation within the upper bound defined by: min(mm->nr_cpus_allowed, mm->mm_users) When the next unused CID value reaches this threshold, stop trying to expand the cid allocation and use the first available cid value instead. Spreading allocation to use all the cid values within the range [ 0, min(mm->nr_cpus_allowed, mm->mm_users) - 1 ] improves cache locality while preserving mm_cid compactness within the expected user limits, - In __mm_cid_try_get, only return cid values within the range [ 0, mm->nr_cpus_allowed ] rather than [ 0, nr_cpu_ids ]. This prevents allocating cids above the number of allowed cpus in rare scenarios where cid allocation races with a concurrent remote-clear of the per-mm/cpu cid. This improvement is made possible by the addition of the per-mm CPUs allowed mask, - In sched_mm_cid_migrate_to, use mm->nr_cpus_allowed rather than t->nr_cpus_allowed. This criterion was really meant to compare the number of mm->mm_users to the number of CPUs allowed for the entire mm. Therefore, the prior comparison worked fine when all threads shared the same CPUs allowed mask, but not so much in scenarios where those threads have different masks (e.g. each thread pinned to a single CPU). This improvement is made possible by the addition of the per-mm CPUs allowed mask. * Benchmarks Each thread increments 16kB worth of 8-bit integers in bursts, with a configurable delay between each thread's execution. Each thread run one after the other (no threads run concurrently). The order of thread execution in the sequence is random. The thread execution sequence begins again after all threads have executed. The 16kB areas are allocated with rseq_mempool and indexed by either cpu_id, mm_cid (not cache-local), or cache-local mm_cid. Each thread is pinned to its own core. Testing configurations: 8-core/1-L3: Use 8 cores within a single L3 24-core/24-L3: Use 24 cores, 1 core per L3 192-core/24-L3: Use 192 cores (all cores in the system) 384-thread/24-L3: Use 384 HW threads (all HW threads in the system) Intermittent workload delays between threads: 200ms, 10ms. Hardware: CPU(s): 384 On-line CPU(s) list: 0-383 Vendor ID: AuthenticAMD Model name: AMD EPYC 9654 96-Core Processor Thread(s) per core: 2 Core(s) per socket: 96 Socket(s): 2 Caches (sum of all): L1d: 6 MiB (192 instances) L1i: 6 MiB (192 instances) L2: 192 MiB (192 instances) L3: 768 MiB (24 instances) Each result is an average of 5 test runs. The cache-local speedup is calculated as: (cache-local mm_cid) / (mm_cid). Intermittent workload delay: 200ms per-cpu mm_cid cache-local mm_cid cache-local speedup (ns) (ns) (ns) 8-core/1-L3 1374 19289 1336 14.4x 24-core/24-L3 2423 26721 1594 16.7x 192-core/24-L3 2291 15826 2153 7.3x 384-thread/24-L3 1874 13234 1907 6.9x Intermittent workload delay: 10ms per-cpu mm_cid cache-local mm_cid cache-local speedup (ns) (ns) (ns) 8-core/1-L3 662 756 686 1.1x 24-core/24-L3 1378 3648 1035 3.5x 192-core/24-L3 1439 10833 1482 7.3x 384-thread/24-L3 1503 10570 1556 6.8x [ This deprecates the prior "sched: NUMA-aware per-memory-map concurrency IDs" patch series with a simpler and more general approach. ] [ This patch applies on top of v6.12-rc1. ] Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Acked-by: Marco Elver Link: https://lore.kernel.org/lkml/20240823185946.418340-1-mathieu.desnoyers@efficios.com/ --- include/linux/mm_types.h | 72 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 63 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6e3bdf8e38bc..381d22eba088 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -782,6 +782,7 @@ struct vm_area_struct { struct mm_cid { u64 time; int cid; + int recent_cid; }; #endif @@ -852,6 +853,27 @@ struct mm_struct { * When the next mm_cid scan is due (in jiffies). */ unsigned long mm_cid_next_scan; + /** + * @nr_cpus_allowed: Number of CPUs allowed for mm. + * + * Number of CPUs allowed in the union of all mm's + * threads allowed CPUs. + */ + unsigned int nr_cpus_allowed; + /** + * @max_nr_cid: Maximum number of concurrency IDs allocated. + * + * Track the highest number of concurrency IDs allocated for the + * mm. + */ + atomic_t max_nr_cid; + /** + * @cpus_allowed_lock: Lock protecting mm cpus_allowed. + * + * Provide mutual exclusion for mm cpus_allowed and + * mm nr_cpus_allowed updates. + */ + raw_spinlock_t cpus_allowed_lock; #endif #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* size of all page tables */ @@ -1170,18 +1192,30 @@ static inline int mm_cid_clear_lazy_put(int cid) return cid & ~MM_CID_LAZY_PUT; } +/* + * mm_cpus_allowed: Union of all mm's threads allowed CPUs. + */ +static inline cpumask_t *mm_cpus_allowed(struct mm_struct *mm) +{ + unsigned long bitmap = (unsigned long)mm; + + bitmap += offsetof(struct mm_struct, cpu_bitmap); + /* Skip cpu_bitmap */ + bitmap += cpumask_size(); + return (struct cpumask *)bitmap; +} + /* Accessor for struct mm_struct's cidmask. */ static inline cpumask_t *mm_cidmask(struct mm_struct *mm) { - unsigned long cid_bitmap = (unsigned long)mm; + unsigned long cid_bitmap = (unsigned long)mm_cpus_allowed(mm); - cid_bitmap += offsetof(struct mm_struct, cpu_bitmap); - /* Skip cpu_bitmap */ + /* Skip mm_cpus_allowed */ cid_bitmap += cpumask_size(); return (struct cpumask *)cid_bitmap; } -static inline void mm_init_cid(struct mm_struct *mm) +static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { int i; @@ -1189,17 +1223,22 @@ static inline void mm_init_cid(struct mm_struct *mm) struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i); pcpu_cid->cid = MM_CID_UNSET; + pcpu_cid->recent_cid = MM_CID_UNSET; pcpu_cid->time = 0; } + mm->nr_cpus_allowed = p->nr_cpus_allowed; + atomic_set(&mm->max_nr_cid, 0); + raw_spin_lock_init(&mm->cpus_allowed_lock); + cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); cpumask_clear(mm_cidmask(mm)); } -static inline int mm_alloc_cid_noprof(struct mm_struct *mm) +static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p) { mm->pcpu_cid = alloc_percpu_noprof(struct mm_cid); if (!mm->pcpu_cid) return -ENOMEM; - mm_init_cid(mm); + mm_init_cid(mm, p); return 0; } #define mm_alloc_cid(...) alloc_hooks(mm_alloc_cid_noprof(__VA_ARGS__)) @@ -1212,16 +1251,31 @@ static inline void mm_destroy_cid(struct mm_struct *mm) static inline unsigned int mm_cid_size(void) { - return cpumask_size(); + return 2 * cpumask_size(); /* mm_cpus_allowed(), mm_cidmask(). */ +} + +static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) +{ + struct cpumask *mm_allowed = mm_cpus_allowed(mm); + + if (!mm) + return; + /* The mm_cpus_allowed is the union of each thread allowed CPUs masks. */ + raw_spin_lock(&mm->cpus_allowed_lock); + cpumask_or(mm_allowed, mm_allowed, cpumask); + WRITE_ONCE(mm->nr_cpus_allowed, cpumask_weight(mm_allowed)); + raw_spin_unlock(&mm->cpus_allowed_lock); } #else /* CONFIG_SCHED_MM_CID */ -static inline void mm_init_cid(struct mm_struct *mm) { } -static inline int mm_alloc_cid(struct mm_struct *mm) { return 0; } +static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { } +static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; } static inline void mm_destroy_cid(struct mm_struct *mm) { } + static inline unsigned int mm_cid_size(void) { return 0; } +static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { } #endif /* CONFIG_SCHED_MM_CID */ struct mmu_gather; -- cgit v1.2.3 From c2f8fde8689272a55b9319b69dfe7e8f0e2e9dfe Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 14 Oct 2024 11:40:56 +0200 Subject: fs: add helper to use mount option as path or fd Allow filesystems to use a mount option either as a file or path. Link: https://lore.kernel.org/r/20241014-work-overlayfs-v3-1-32b3fed1286e@kernel.org Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- include/linux/fs_parser.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h index 6cf713a7e6c6..3cef566088fc 100644 --- a/include/linux/fs_parser.h +++ b/include/linux/fs_parser.h @@ -28,7 +28,8 @@ typedef int fs_param_type(struct p_log *, */ fs_param_type fs_param_is_bool, fs_param_is_u32, fs_param_is_s32, fs_param_is_u64, fs_param_is_enum, fs_param_is_string, fs_param_is_blob, fs_param_is_blockdev, - fs_param_is_path, fs_param_is_fd, fs_param_is_uid, fs_param_is_gid; + fs_param_is_path, fs_param_is_fd, fs_param_is_uid, fs_param_is_gid, + fs_param_is_file_or_string; /* * Specification of the type of value a parameter wants. @@ -133,6 +134,8 @@ static inline bool fs_validate_description(const char *name, #define fsparam_bdev(NAME, OPT) __fsparam(fs_param_is_blockdev, NAME, OPT, 0, NULL) #define fsparam_path(NAME, OPT) __fsparam(fs_param_is_path, NAME, OPT, 0, NULL) #define fsparam_fd(NAME, OPT) __fsparam(fs_param_is_fd, NAME, OPT, 0, NULL) +#define fsparam_file_or_string(NAME, OPT) \ + __fsparam(fs_param_is_file_or_string, NAME, OPT, 0, NULL) #define fsparam_uid(NAME, OPT) __fsparam(fs_param_is_uid, NAME, OPT, 0, NULL) #define fsparam_gid(NAME, OPT) __fsparam(fs_param_is_gid, NAME, OPT, 0, NULL) -- cgit v1.2.3 From 6e3ea06240adfef7b46e2338dd824541c31de06d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 7 Oct 2024 18:03:23 +0300 Subject: dmaengine: acpi: Drop unused devm_acpi_dma_controller_free() After introduction a few years ago the devm_acpi_dma_controller_free() was never used. Drop it. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20241007150436.2183575-2-andriy.shevchenko@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/acpi_dma.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi_dma.h b/include/linux/acpi_dma.h index 72cedb916a9c..3ef1ec7a04cb 100644 --- a/include/linux/acpi_dma.h +++ b/include/linux/acpi_dma.h @@ -65,7 +65,6 @@ int devm_acpi_dma_controller_register(struct device *dev, struct dma_chan *(*acpi_dma_xlate) (struct acpi_dma_spec *, struct acpi_dma *), void *data); -void devm_acpi_dma_controller_free(struct device *dev); struct dma_chan *acpi_dma_request_slave_chan_by_index(struct device *dev, size_t index); @@ -94,9 +93,6 @@ static inline int devm_acpi_dma_controller_register(struct device *dev, { return -ENODEV; } -static inline void devm_acpi_dma_controller_free(struct device *dev) -{ -} static inline struct dma_chan *acpi_dma_request_slave_chan_by_index( struct device *dev, size_t index) -- cgit v1.2.3 From 662f045332addc961940e48eb920caa954abbf09 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 7 Oct 2024 18:03:25 +0300 Subject: dmaengine: acpi: Clean up headers There is a few things done: - include only the headers we are direct user of - when pointer is in use, provide a forward declaration - add missing headers - sort alphabetically Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20241007150436.2183575-4-andriy.shevchenko@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/acpi_dma.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi_dma.h b/include/linux/acpi_dma.h index 3ef1ec7a04cb..e748b2877602 100644 --- a/include/linux/acpi_dma.h +++ b/include/linux/acpi_dma.h @@ -11,10 +11,11 @@ #ifndef __LINUX_ACPI_DMA_H #define __LINUX_ACPI_DMA_H -#include -#include #include #include +#include + +struct device; /** * struct acpi_dma_spec - slave device DMA resources -- cgit v1.2.3 From 6ba55951e70bf7a8d89c03aa5612d2e0c81ded6d Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Thu, 10 Oct 2024 11:27:15 -0500 Subject: logic_pio: Constify fwnode_handle The fwnode_handle passed into find_io_range_by_fwnode() and logic_pio_trans_hwaddr() are not modified, so make them const. Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20241010-dt-const-v1-2-87a51f558425@kernel.org Signed-off-by: Rob Herring (Arm) --- include/linux/logic_pio.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/logic_pio.h b/include/linux/logic_pio.h index babf4e3c28ba..8f1a9408302f 100644 --- a/include/linux/logic_pio.h +++ b/include/linux/logic_pio.h @@ -17,7 +17,7 @@ enum { struct logic_pio_hwaddr { struct list_head list; - struct fwnode_handle *fwnode; + const struct fwnode_handle *fwnode; resource_size_t hw_start; resource_size_t io_start; resource_size_t size; /* range size populated */ @@ -110,8 +110,8 @@ void logic_outsl(unsigned long addr, const void *buffer, unsigned int count); #endif /* CONFIG_INDIRECT_PIO */ #define MMIO_UPPER_LIMIT (IO_SPACE_LIMIT - PIO_INDIRECT_SIZE) -struct logic_pio_hwaddr *find_io_range_by_fwnode(struct fwnode_handle *fwnode); -unsigned long logic_pio_trans_hwaddr(struct fwnode_handle *fwnode, +struct logic_pio_hwaddr *find_io_range_by_fwnode(const struct fwnode_handle *fwnode); +unsigned long logic_pio_trans_hwaddr(const struct fwnode_handle *fwnode, resource_size_t hw_addr, resource_size_t size); int logic_pio_register_range(struct logic_pio_hwaddr *newrange); void logic_pio_unregister_range(struct logic_pio_hwaddr *range); -- cgit v1.2.3 From 78e2baf3d96edd21c6f26d8afc0e68d02ec2c51c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2024 17:48:13 +0000 Subject: net: add TIME_WAIT logic to sk_to_full_sk() TCP will soon attach TIME_WAIT sockets to some ACK and RST. Make sure sk_to_full_sk() detects this and does not return a non full socket. v3: also changed sk_const_to_full_sk() Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Martin KaFai Lau Reviewed-by: Brian Vazquez Link: https://patch.msgid.link/20241010174817.1543642-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/bpf-cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index ce91d9b2acb9..f0f219271daf 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -209,7 +209,7 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk) { \ typeof(sk) __sk = sk_to_full_sk(sk); \ - if (sk_fullsock(__sk) && __sk == skb_to_full_sk(skb) && \ + if (__sk && __sk == skb_to_full_sk(skb) && \ cgroup_bpf_sock_enabled(__sk, CGROUP_INET_EGRESS)) \ __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \ CGROUP_INET_EGRESS); \ -- cgit v1.2.3 From f15e3b3ddb9fab1c1731b6154e2cd6573fb54c4d Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Fri, 11 Oct 2024 18:44:56 +0000 Subject: net: napi: Make napi_defer_hard_irqs per-NAPI Add defer_hard_irqs to napi_struct in preparation for per-NAPI settings. The existing sysfs parameter is respected; writes to sysfs will write to all NAPI structs for the device and the net_device defer_hard_irq field. Reads from sysfs show the net_device field. The ability to set defer_hard_irqs on specific NAPI instances will be added in a later commit, via netdev-genl. Signed-off-by: Joe Damato Reviewed-by: Eric Dumazet Reviewed-by: Jakub Kicinski Link: https://patch.msgid.link/20241011184527.16393-2-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e6b93d01e631..2e7bc23660ec 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -373,6 +373,7 @@ struct napi_struct { unsigned int napi_id; struct hrtimer timer; struct task_struct *thread; + u32 defer_hard_irqs; /* control-path-only fields follow */ struct list_head dev_list; struct hlist_node napi_hash_node; @@ -2085,7 +2086,6 @@ struct net_device { unsigned int real_num_rx_queues; struct netdev_rx_queue *_rx; unsigned long gro_flush_timeout; - u32 napi_defer_hard_irqs; unsigned int gro_max_size; unsigned int gro_ipv4_max_size; rx_handler_func_t __rcu *rx_handler; @@ -2413,6 +2413,7 @@ struct net_device { struct dim_irq_moder *irq_moder; u64 max_pacing_offload_horizon; + u32 napi_defer_hard_irqs; /** * @lock: protects @net_shaper_hierarchy, feel free to use for other -- cgit v1.2.3 From acb8d4ed5661d05f794ef2ce34fd11e699e9ca32 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Fri, 11 Oct 2024 18:44:58 +0000 Subject: net: napi: Make gro_flush_timeout per-NAPI Allow per-NAPI gro_flush_timeout setting. The existing sysfs parameter is respected; writes to sysfs will write to all NAPI structs for the device and the net_device gro_flush_timeout field. Reads from sysfs will read from the net_device field. The ability to set gro_flush_timeout on specific NAPI instances will be added in a later commit, via netdev-genl. Signed-off-by: Joe Damato Reviewed-by: Eric Dumazet Reviewed-by: Jakub Kicinski Link: https://patch.msgid.link/20241011184527.16393-4-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2e7bc23660ec..93241d4de437 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -373,6 +373,7 @@ struct napi_struct { unsigned int napi_id; struct hrtimer timer; struct task_struct *thread; + unsigned long gro_flush_timeout; u32 defer_hard_irqs; /* control-path-only fields follow */ struct list_head dev_list; @@ -2085,7 +2086,6 @@ struct net_device { int ifindex; unsigned int real_num_rx_queues; struct netdev_rx_queue *_rx; - unsigned long gro_flush_timeout; unsigned int gro_max_size; unsigned int gro_ipv4_max_size; rx_handler_func_t __rcu *rx_handler; @@ -2413,6 +2413,7 @@ struct net_device { struct dim_irq_moder *irq_moder; u64 max_pacing_offload_horizon; + unsigned long gro_flush_timeout; u32 napi_defer_hard_irqs; /** -- cgit v1.2.3 From 86e25f40aa1e9e54e081e55016f65b5c92523989 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Fri, 11 Oct 2024 18:45:00 +0000 Subject: net: napi: Add napi_config Add a persistent NAPI config area for NAPI configuration to the core. Drivers opt-in to setting the persistent config for a NAPI by passing an index when calling netif_napi_add_config. napi_config is allocated in alloc_netdev_mqs, freed in free_netdev (after the NAPIs are deleted). Drivers which call netif_napi_add_config will have persistent per-NAPI settings: NAPI IDs, gro_flush_timeout, and defer_hard_irq settings. Per-NAPI settings are saved in napi_disable and restored in napi_enable. Co-developed-by: Martin Karsten Signed-off-by: Martin Karsten Signed-off-by: Joe Damato Reviewed-by: Jakub Kicinski Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241011184527.16393-6-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 93241d4de437..8feaca12655e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -342,6 +342,15 @@ struct gro_list { */ #define GRO_HASH_BUCKETS 8 +/* + * Structure for per-NAPI config + */ +struct napi_config { + u64 gro_flush_timeout; + u32 defer_hard_irqs; + unsigned int napi_id; +}; + /* * Structure for NAPI scheduling similar to tasklet but with weighting */ @@ -379,6 +388,8 @@ struct napi_struct { struct list_head dev_list; struct hlist_node napi_hash_node; int irq; + int index; + struct napi_config *config; }; enum { @@ -1868,9 +1879,6 @@ enum netdev_reg_state { * allocated at register_netdev() time * @real_num_rx_queues: Number of RX queues currently active in device * @xdp_prog: XDP sockets filter program pointer - * @gro_flush_timeout: timeout for GRO layer in NAPI - * @napi_defer_hard_irqs: If not zero, provides a counter that would - * allow to avoid NIC hard IRQ, on busy queues. * * @rx_handler: handler for received packets * @rx_handler_data: XXX: need comments on this one @@ -2020,6 +2028,11 @@ enum netdev_reg_state { * where the clock is recovered. * * @max_pacing_offload_horizon: max EDT offload horizon in nsec. + * @napi_config: An array of napi_config structures containing per-NAPI + * settings. + * @gro_flush_timeout: timeout for GRO layer in NAPI + * @napi_defer_hard_irqs: If not zero, provides a counter that would + * allow to avoid NIC hard IRQ, on busy queues. * * FIXME: cleanup struct net_device such that network protocol info * moves out. @@ -2413,6 +2426,7 @@ struct net_device { struct dim_irq_moder *irq_moder; u64 max_pacing_offload_horizon; + struct napi_config *napi_config; unsigned long gro_flush_timeout; u32 napi_defer_hard_irqs; @@ -2678,6 +2692,22 @@ netif_napi_add_tx_weight(struct net_device *dev, netif_napi_add_weight(dev, napi, poll, weight); } +/** + * netif_napi_add_config - initialize a NAPI context with persistent config + * @dev: network device + * @napi: NAPI context + * @poll: polling function + * @index: the NAPI index + */ +static inline void +netif_napi_add_config(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int index) +{ + napi->index = index; + napi->config = &dev->napi_config[index]; + netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT); +} + /** * netif_napi_add_tx() - initialize a NAPI context to be used for Tx only * @dev: network device -- cgit v1.2.3 From 4971266e1595f76be3f844c834c1f9357a97dbde Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 10 Oct 2024 16:25:03 -0700 Subject: bpf: Add kmem_cache iterator The new "kmem_cache" iterator will traverse the list of slab caches and call attached BPF programs for each entry. It should check the argument (ctx.s) if it's NULL before using it. Now the iteration grabs the slab_mutex only if it traverse the list and releases the mutex when it runs the BPF program. The kmem_cache entry is protected by a refcount during the execution. Signed-off-by: Namhyung Kim Acked-by: Vlastimil Babka #slab Link: https://lore.kernel.org/r/20241010232505.1339892-2-namhyung@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/btf_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index c0e3e1426a82..139bdececdcf 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -283,5 +283,6 @@ extern u32 btf_tracing_ids[]; extern u32 bpf_cgroup_btf_id[]; extern u32 bpf_local_storage_map_btf_id[]; extern u32 btf_bpf_map_id[]; +extern u32 bpf_kmem_cache_btf_id[]; #endif -- cgit v1.2.3 From 6632863226d88383c9e2bedfeeb928ac7f8232b9 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 9 Oct 2024 13:18:08 +0800 Subject: iommu: Remove iommu_present() The last callsite of iommu_present() is removed by commit <45c690aea8ee> ("drm/tegra: Use iommu_paging_domain_alloc()"). Remove it to avoid dead code. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Yi Liu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20241009051808.29455-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index bd722f473635..62d1b85c80d3 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -785,7 +785,6 @@ static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather) } extern int bus_iommu_probe(const struct bus_type *bus); -extern bool iommu_present(const struct bus_type *bus); extern bool device_iommu_capable(struct device *dev, enum iommu_cap cap); extern bool iommu_group_has_isolated_msi(struct iommu_group *group); extern struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus); @@ -1081,11 +1080,6 @@ struct iommu_iotlb_gather {}; struct iommu_dirty_bitmap {}; struct iommu_dirty_ops {}; -static inline bool iommu_present(const struct bus_type *bus) -{ - return false; -} - static inline bool device_iommu_capable(struct device *dev, enum iommu_cap cap) { return false; -- cgit v1.2.3 From a274465cc3bef2dfd9c9ea5100848dda0a8641e1 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Thu, 10 Oct 2024 13:54:19 +0100 Subject: net: phy: support 'active-high' property for PHY LEDs In addition to 'active-low' and 'inactive-high-impedance' also support 'active-high' property for PHY LED pin configuration. As only either 'active-high' or 'active-low' can be set at the same time, WARN and return an error in case both are set. Signed-off-by: Daniel Golle Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/91598487773d768f254d5faf06cf65b13e972f0e.1728558223.git.daniel@makrotopia.org Signed-off-by: Paolo Abeni --- include/linux/phy.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index ff762a3d8270..bf0eb4e5d35c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -877,8 +877,9 @@ struct phy_plca_status { /* Modes for PHY LED configuration */ enum phy_led_modes { - PHY_LED_ACTIVE_LOW = 0, - PHY_LED_INACTIVE_HIGH_IMPEDANCE = 1, + PHY_LED_ACTIVE_HIGH = 0, + PHY_LED_ACTIVE_LOW = 1, + PHY_LED_INACTIVE_HIGH_IMPEDANCE = 2, /* keep it last */ __PHY_LED_MODES_NUM, -- cgit v1.2.3 From f68303cf1cf2fb96e20df5499c194f1fe5bab9e2 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Thu, 10 Oct 2024 11:27:14 -0500 Subject: PCI: Constify pci_register_io_range() fwnode_handle pci_register_io_range() does not modify the passed in fwnode_handle, so make it const. Acked-by: Bjorn Helgaas Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20241010-dt-const-v1-1-87a51f558425@kernel.org Signed-off-by: Rob Herring (Arm) --- include/linux/pci.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 573b4c4c2be6..733ff6570e2d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1556,7 +1556,7 @@ int __must_check pci_bus_alloc_resource(struct pci_bus *bus, void *alignf_data); -int pci_register_io_range(struct fwnode_handle *fwnode, phys_addr_t addr, +int pci_register_io_range(const struct fwnode_handle *fwnode, phys_addr_t addr, resource_size_t size); unsigned long pci_address_to_pio(phys_addr_t addr); phys_addr_t pci_pio_to_address(unsigned long pio); @@ -2019,7 +2019,7 @@ static inline int pci_request_regions(struct pci_dev *dev, const char *res_name) { return -EIO; } static inline void pci_release_regions(struct pci_dev *dev) { } -static inline int pci_register_io_range(struct fwnode_handle *fwnode, +static inline int pci_register_io_range(const struct fwnode_handle *fwnode, phys_addr_t addr, resource_size_t size) { return -EINVAL; } -- cgit v1.2.3 From ec8c2329da1a8695b3fc80ba67cad9dc75d9a3ec Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Thu, 10 Oct 2024 11:27:16 -0500 Subject: of: Constify struct device_node function arguments Functions which don't change the refcount or otherwise modify struct device_node can make struct device_node const. Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20241010-dt-const-v1-3-87a51f558425@kernel.org Signed-off-by: Rob Herring (Arm) --- include/linux/of.h | 14 +++++++------- include/linux/of_address.h | 4 ++-- include/linux/of_irq.h | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 85b60ac9eec5..7875b308f13c 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -357,7 +357,7 @@ extern struct device_node *of_get_cpu_node(int cpu, unsigned int *thread); extern struct device_node *of_cpu_device_node_get(int cpu); extern int of_cpu_node_to_id(struct device_node *np); extern struct device_node *of_get_next_cpu_node(struct device_node *prev); -extern struct device_node *of_get_cpu_state_node(struct device_node *cpu_node, +extern struct device_node *of_get_cpu_state_node(const struct device_node *cpu_node, int index); extern u64 of_get_cpu_hwid(struct device_node *cpun, unsigned int thread); @@ -395,7 +395,7 @@ extern int of_phandle_iterator_args(struct of_phandle_iterator *it, int size); extern void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align)); -extern int of_alias_get_id(struct device_node *np, const char *stem); +extern int of_alias_get_id(const struct device_node *np, const char *stem); extern int of_alias_get_highest_id(const char *stem); bool of_machine_compatible_match(const char *const *compats); @@ -446,9 +446,9 @@ const __be32 *of_prop_next_u32(struct property *prop, const __be32 *cur, */ const char *of_prop_next_string(struct property *prop, const char *cur); -bool of_console_check(struct device_node *dn, char *name, int index); +bool of_console_check(const struct device_node *dn, char *name, int index); -int of_map_id(struct device_node *np, u32 id, +int of_map_id(const struct device_node *np, u32 id, const char *map_name, const char *map_mask_name, struct device_node **target, u32 *id_out); @@ -871,7 +871,7 @@ static inline void of_property_clear_flag(struct property *p, unsigned long flag { } -static inline int of_map_id(struct device_node *np, u32 id, +static inline int of_map_id(const struct device_node *np, u32 id, const char *map_name, const char *map_mask_name, struct device_node **target, u32 *id_out) { @@ -1734,7 +1734,7 @@ struct of_overlay_notify_data { #ifdef CONFIG_OF_OVERLAY int of_overlay_fdt_apply(const void *overlay_fdt, u32 overlay_fdt_size, - int *ovcs_id, struct device_node *target_base); + int *ovcs_id, const struct device_node *target_base); int of_overlay_remove(int *ovcs_id); int of_overlay_remove_all(void); @@ -1744,7 +1744,7 @@ int of_overlay_notifier_unregister(struct notifier_block *nb); #else static inline int of_overlay_fdt_apply(const void *overlay_fdt, u32 overlay_fdt_size, - int *ovcs_id, struct device_node *target_base) + int *ovcs_id, const struct device_node *target_base) { return -ENOTSUPP; } diff --git a/include/linux/of_address.h b/include/linux/of_address.h index 26a19daf0d09..bd46dbcc6e88 100644 --- a/include/linux/of_address.h +++ b/include/linux/of_address.h @@ -83,8 +83,8 @@ extern struct of_pci_range *of_pci_range_parser_one( struct of_pci_range *range); extern int of_pci_address_to_resource(struct device_node *dev, int bar, struct resource *r); -extern int of_pci_range_to_resource(struct of_pci_range *range, - struct device_node *np, +extern int of_pci_range_to_resource(const struct of_pci_range *range, + const struct device_node *np, struct resource *res); extern int of_range_to_resource(struct device_node *np, int index, struct resource *res); diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index d6d3eae2f145..6337ad4e5fe8 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -48,12 +48,12 @@ extern int of_irq_to_resource_table(struct device_node *dev, struct resource *res, int nr_irqs); extern struct device_node *of_irq_find_parent(struct device_node *child); extern struct irq_domain *of_msi_get_domain(struct device *dev, - struct device_node *np, + const struct device_node *np, enum irq_domain_bus_token token); extern struct irq_domain *of_msi_map_get_device_domain(struct device *dev, u32 id, u32 bus_token); -extern void of_msi_configure(struct device *dev, struct device_node *np); +extern void of_msi_configure(struct device *dev, const struct device_node *np); u32 of_msi_map_id(struct device *dev, struct device_node *msi_np, u32 id_in); #else static inline void of_irq_init(const struct of_device_id *matches) -- cgit v1.2.3 From 9c63fea9acd077701fd67bbc21b1e77d6b98b7e0 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Thu, 10 Oct 2024 11:27:17 -0500 Subject: of: Constify struct property pointers Most accesses to struct property do not modify it, so constify struct property pointers where ever possible in the DT core code. Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20241010-dt-const-v1-4-87a51f558425@kernel.org Signed-off-by: Rob Herring (Arm) --- include/linux/of.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 7875b308f13c..086a60f3b8a6 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -435,7 +435,7 @@ extern int of_detach_node(struct device_node *); * of_property_for_each_u32(np, "propname", u) * printk("U32 value: %x\n", u); */ -const __be32 *of_prop_next_u32(struct property *prop, const __be32 *cur, +const __be32 *of_prop_next_u32(const struct property *prop, const __be32 *cur, u32 *pu); /* * struct property *prop; @@ -444,7 +444,7 @@ const __be32 *of_prop_next_u32(struct property *prop, const __be32 *cur, * of_property_for_each_string(np, "propname", prop, s) * printk("String value: %s\n", s); */ -const char *of_prop_next_string(struct property *prop, const char *cur); +const char *of_prop_next_string(const struct property *prop, const char *cur); bool of_console_check(const struct device_node *dn, char *name, int index); @@ -826,13 +826,13 @@ static inline bool of_console_check(const struct device_node *dn, const char *na return false; } -static inline const __be32 *of_prop_next_u32(struct property *prop, +static inline const __be32 *of_prop_next_u32(const struct property *prop, const __be32 *cur, u32 *pu) { return NULL; } -static inline const char *of_prop_next_string(struct property *prop, +static inline const char *of_prop_next_string(const struct property *prop, const char *cur) { return NULL; @@ -899,7 +899,7 @@ static inline const void *of_device_get_match_data(const struct device *dev) #define of_node_cmp(s1, s2) strcasecmp((s1), (s2)) #endif -static inline int of_prop_val_eq(struct property *p1, struct property *p2) +static inline int of_prop_val_eq(const struct property *p1, const struct property *p2) { return p1->length == p2->length && !memcmp(p1->value, p2->value, (size_t)p1->length); @@ -1252,7 +1252,7 @@ static inline int of_property_read_string_index(const struct device_node *np, static inline bool of_property_read_bool(const struct device_node *np, const char *propname) { - struct property *prop = of_find_property(np, propname, NULL); + const struct property *prop = of_find_property(np, propname, NULL); return prop ? true : false; } @@ -1430,7 +1430,7 @@ static inline int of_property_read_s32(const struct device_node *np, err = of_phandle_iterator_next(it)) #define of_property_for_each_u32(np, propname, u) \ - for (struct {struct property *prop; const __be32 *item; } _it = \ + for (struct {const struct property *prop; const __be32 *item; } _it = \ {of_find_property(np, propname, NULL), \ of_prop_next_u32(_it.prop, NULL, &u)}; \ _it.item; \ -- cgit v1.2.3 From d79616b04f0e08178ceb716a5d2ef60ab723d532 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Thu, 10 Oct 2024 11:27:20 -0500 Subject: of/address: Constify of_busses[] array and pointers The of_busses array is fixed, so it and all struct of_bus pointers can be const. Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20241010-dt-const-v1-7-87a51f558425@kernel.org Signed-off-by: Rob Herring (Arm) --- include/linux/of_address.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of_address.h b/include/linux/of_address.h index bd46dbcc6e88..9e034363788a 100644 --- a/include/linux/of_address.h +++ b/include/linux/of_address.h @@ -10,7 +10,7 @@ struct of_bus; struct of_pci_range_parser { struct device_node *node; - struct of_bus *bus; + const struct of_bus *bus; const __be32 *range; const __be32 *end; int na; -- cgit v1.2.3 From 74fe1ad4132234f04fcc75e16600449496a67b5b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Oct 2024 18:50:14 +0200 Subject: debugobjects: Prepare for batching Move the debug_obj::object pointer into a union and add a pointer to the last node in a batch. That allows to implement batch processing efficiently by utilizing the stack property of hlist: When the first object of a batch is added to the list, then the batch pointer is set to the hlist node of the object itself. Any subsequent add retrieves the pointer to the last node from the first object in the list and uses that for storing the last node pointer in the newly added object. Add the pointer to the data structure and ensure that all relevant pool sizes are strictly batch sized. The actual batching implementation follows in subsequent changes. Signed-off-by: Thomas Gleixner Reviewed-by: Zhen Lei Link: https://lore.kernel.org/all/20241007164914.139204961@linutronix.de --- include/linux/debugobjects.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h index 32444686b6ff..8b95545e7924 100644 --- a/include/linux/debugobjects.h +++ b/include/linux/debugobjects.h @@ -23,13 +23,17 @@ struct debug_obj_descr; * @state: tracked object state * @astate: current active state * @object: pointer to the real object + * @batch_last: pointer to the last hlist node in a batch * @descr: pointer to an object type specific debug description structure */ struct debug_obj { - struct hlist_node node; - enum debug_obj_state state; - unsigned int astate; - void *object; + struct hlist_node node; + enum debug_obj_state state; + unsigned int astate; + union { + void *object; + struct hlist_node *batch_last; + }; const struct debug_obj_descr *descr; }; -- cgit v1.2.3 From e4cf33ca48128d580e25ebe779b7ba7b4b4cf733 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Oct 2024 20:21:14 -0400 Subject: ftrace: Consolidate ftrace_regs accessor functions for archs using pt_regs Most architectures use pt_regs within ftrace_regs making a lot of the accessor functions just calls to the pt_regs internally. Instead of duplication this effort, use a HAVE_ARCH_FTRACE_REGS for architectures that have their own ftrace_regs that is not based on pt_regs and will define all the accessor functions, and for the architectures that just use pt_regs, it will leave it undefined, and the default accessor functions will be used. Note, this will also make it easier to add new accessor functions to ftrace_regs as it will mean having to touch less architectures. Cc: Cc: "x86@kernel.org" Cc: Mathieu Desnoyers Cc: Mark Rutland Cc: Will Deacon Cc: Huacai Chen Cc: WANG Xuerui Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Cc: Naveen N Rao Cc: Madhavan Srinivasan Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Link: https://lore.kernel.org/20241010202114.2289f6fd@gandalf.local.home Acked-by: Masami Hiramatsu (Google) Acked-by: Heiko Carstens # s390 Acked-by: Catalin Marinas Acked-by: Michael Ellerman # powerpc Suggested-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 32 +++++++------------------------- include/linux/ftrace_regs.h | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 25 deletions(-) create mode 100644 include/linux/ftrace_regs.h (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 66f10291a0b2..aa9ddd1e4bb6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -113,6 +113,8 @@ static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *val #ifdef CONFIG_FUNCTION_TRACER +#include + extern int ftrace_enabled; /** @@ -150,14 +152,11 @@ struct ftrace_regs { #define ftrace_regs_size() sizeof(struct __arch_ftrace_regs) #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS - -struct __arch_ftrace_regs { - struct pt_regs regs; -}; - -struct ftrace_regs; -#define arch_ftrace_regs(fregs) ((struct __arch_ftrace_regs *)(fregs)) - +/* + * Architectures that define HAVE_DYNAMIC_FTRACE_WITH_ARGS must define their own + * arch_ftrace_get_regs() where it only returns pt_regs *if* it is fully + * populated. It should return NULL otherwise. + */ static inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs) { return &arch_ftrace_regs(fregs)->regs; @@ -191,23 +190,6 @@ static __always_inline bool ftrace_regs_has_args(struct ftrace_regs *fregs) return ftrace_get_regs(fregs) != NULL; } -#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS -#define ftrace_regs_get_instruction_pointer(fregs) \ - instruction_pointer(ftrace_get_regs(fregs)) -#define ftrace_regs_get_argument(fregs, n) \ - regs_get_kernel_argument(ftrace_get_regs(fregs), n) -#define ftrace_regs_get_stack_pointer(fregs) \ - kernel_stack_pointer(ftrace_get_regs(fregs)) -#define ftrace_regs_return_value(fregs) \ - regs_return_value(ftrace_get_regs(fregs)) -#define ftrace_regs_set_return_value(fregs, ret) \ - regs_set_return_value(ftrace_get_regs(fregs), ret) -#define ftrace_override_function_with_return(fregs) \ - override_function_with_return(ftrace_get_regs(fregs)) -#define ftrace_regs_query_register_offset(name) \ - regs_query_register_offset(name) -#endif - typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); diff --git a/include/linux/ftrace_regs.h b/include/linux/ftrace_regs.h new file mode 100644 index 000000000000..dea6a0851b74 --- /dev/null +++ b/include/linux/ftrace_regs.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_FTRACE_REGS_H +#define _LINUX_FTRACE_REGS_H + +/* + * For archs that just copy pt_regs in ftrace regs, it can use this default. + * If an architecture does not use pt_regs, it must define all the below + * accessor functions. + */ +#ifndef HAVE_ARCH_FTRACE_REGS +struct __arch_ftrace_regs { + struct pt_regs regs; +}; + +#define arch_ftrace_regs(fregs) ((struct __arch_ftrace_regs *)(fregs)) + +struct ftrace_regs; + +#define ftrace_regs_get_instruction_pointer(fregs) \ + instruction_pointer(arch_ftrace_get_regs(fregs)) +#define ftrace_regs_get_argument(fregs, n) \ + regs_get_kernel_argument(arch_ftrace_get_regs(fregs), n) +#define ftrace_regs_get_stack_pointer(fregs) \ + kernel_stack_pointer(arch_ftrace_get_regs(fregs)) +#define ftrace_regs_return_value(fregs) \ + regs_return_value(arch_ftrace_get_regs(fregs)) +#define ftrace_regs_set_return_value(fregs, ret) \ + regs_set_return_value(arch_ftrace_get_regs(fregs), ret) +#define ftrace_override_function_with_return(fregs) \ + override_function_with_return(arch_ftrace_get_regs(fregs)) +#define ftrace_regs_query_register_offset(name) \ + regs_query_register_offset(name) + +#endif /* HAVE_ARCH_FTRACE_REGS */ + +#endif /* _LINUX_FTRACE_REGS_H */ -- cgit v1.2.3 From a066397e8ed1036e8b959050ab6e830ee90d9f58 Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Fri, 13 Sep 2024 19:19:53 -0400 Subject: tpm: fix unsigned/signed mismatch errors related to __calc_tpm2_event_size __calc_tpm2_event_size returns 0 or a positive length, but return values are often interpreted as ints. Convert everything over to u32 to avoid signed/unsigned logic errors. Signed-off-by: Gregory Price Signed-off-by: Ard Biesheuvel --- include/linux/tpm_eventlog.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h index 7d68a5cc5881..891368e82558 100644 --- a/include/linux/tpm_eventlog.h +++ b/include/linux/tpm_eventlog.h @@ -157,7 +157,7 @@ struct tcg_algorithm_info { * Return: size of the event on success, 0 on failure */ -static __always_inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, +static __always_inline u32 __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, struct tcg_pcr_event *event_header, bool do_mapping) { -- cgit v1.2.3 From 58797abed49d6b78c7af99b03b037f20c7ffb203 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 5 Oct 2024 12:04:17 +0200 Subject: power: supply: core: constify power_supply_battery_info::resist_table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The power supply core never modifies the resist table. Reflect this in the API, so drivers can mark their static tables as const. Signed-off-by: Thomas Weißschuh Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20241005-power-supply-battery-const-v1-1-c1f721927048@weissschuh.net Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 910d407ebe63..9253411c105f 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -752,7 +752,7 @@ struct power_supply_battery_info { int temp_max; struct power_supply_battery_ocv_table *ocv_table[POWER_SUPPLY_OCV_TEMP_MAX]; int ocv_table_size[POWER_SUPPLY_OCV_TEMP_MAX]; - struct power_supply_resistance_temp_table *resist_table; + const struct power_supply_resistance_temp_table *resist_table; int resist_table_size; const struct power_supply_vbat_ri_table *vbat2ri_discharging; int vbat2ri_discharging_size; @@ -805,7 +805,7 @@ power_supply_find_ocv2cap_table(struct power_supply_battery_info *info, extern int power_supply_batinfo_ocv2cap(struct power_supply_battery_info *info, int ocv, int temp); extern int -power_supply_temp2resist_simple(struct power_supply_resistance_temp_table *table, +power_supply_temp2resist_simple(const struct power_supply_resistance_temp_table *table, int table_len, int temp); extern int power_supply_vbat2ri(struct power_supply_battery_info *info, int vbat_uv, bool charging); -- cgit v1.2.3 From ce20d5b9e37099a035ab34d4d3f59e1744756385 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 5 Oct 2024 12:04:21 +0200 Subject: power: supply: core: constify power_supply_battery_info::ocv_table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The power supply core never modifies the ocv table. Reflect this in the API, so drivers can mark their static tables as const. Signed-off-by: Thomas Weißschuh Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20241005-power-supply-battery-const-v1-5-c1f721927048@weissschuh.net Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 9253411c105f..4e29ec39c18f 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -750,7 +750,7 @@ struct power_supply_battery_info { int temp_alert_max; int temp_min; int temp_max; - struct power_supply_battery_ocv_table *ocv_table[POWER_SUPPLY_OCV_TEMP_MAX]; + const struct power_supply_battery_ocv_table *ocv_table[POWER_SUPPLY_OCV_TEMP_MAX]; int ocv_table_size[POWER_SUPPLY_OCV_TEMP_MAX]; const struct power_supply_resistance_temp_table *resist_table; int resist_table_size; @@ -797,9 +797,9 @@ extern bool power_supply_battery_info_has_prop(struct power_supply_battery_info extern int power_supply_battery_info_get_prop(struct power_supply_battery_info *info, enum power_supply_property psp, union power_supply_propval *val); -extern int power_supply_ocv2cap_simple(struct power_supply_battery_ocv_table *table, +extern int power_supply_ocv2cap_simple(const struct power_supply_battery_ocv_table *table, int table_len, int ocv); -extern struct power_supply_battery_ocv_table * +extern const struct power_supply_battery_ocv_table * power_supply_find_ocv2cap_table(struct power_supply_battery_info *info, int temp, int *table_len); extern int power_supply_batinfo_ocv2cap(struct power_supply_battery_info *info, -- cgit v1.2.3 From 49000fee9e639f62ba1f965ed2ae4c5ad18d19e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 5 Oct 2024 12:05:03 +0200 Subject: power: supply: core: add wakeup source inhibit by power_supply_config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To inhibit wakeup users currently have to use dedicated functions to register the power supply: {,devm_}power_supply_register_no_ws(). This is inconsistent to other runtime settings which can be configured through struct power_supply_config. It's also not obvious what _no_ws() is meant to mean. Extend power_supply_config to also be able to inhibit the wakeup source. Signed-off-by: Thomas Weißschuh Reviewed-by: Tzung-Bi Shih Link: https://lore.kernel.org/r/20241005-power-supply-no-wakeup-source-v1-1-1d62bf9bcb1d@weissschuh.net Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 4e29ec39c18f..9a64043798bd 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -236,6 +236,8 @@ struct power_supply_config { char **supplied_to; size_t num_supplicants; + + bool no_wakeup_source; }; /* Description of power supply */ -- cgit v1.2.3 From 85d319e14f301e1c68131b74c1dceabae73d1e81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 5 Oct 2024 12:05:10 +0200 Subject: power: supply: core: remove {,devm_}power_supply_register_no_ws() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The same functionality is available through power_supply_config::no_wakeup_source, which is more idiomatic. All users of the old API have been converted. Also remove the argument "ws" from __power_supply_register(), as it is now always "true". Signed-off-by: Thomas Weißschuh Reviewed-by: Tzung-Bi Shih Link: https://lore.kernel.org/r/20241005-power-supply-no-wakeup-source-v1-8-1d62bf9bcb1d@weissschuh.net Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 9a64043798bd..14eedefb3ac8 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -874,17 +874,9 @@ power_supply_register(struct device *parent, const struct power_supply_desc *desc, const struct power_supply_config *cfg); extern struct power_supply *__must_check -power_supply_register_no_ws(struct device *parent, - const struct power_supply_desc *desc, - const struct power_supply_config *cfg); -extern struct power_supply *__must_check devm_power_supply_register(struct device *parent, const struct power_supply_desc *desc, const struct power_supply_config *cfg); -extern struct power_supply *__must_check -devm_power_supply_register_no_ws(struct device *parent, - const struct power_supply_desc *desc, - const struct power_supply_config *cfg); extern void power_supply_unregister(struct power_supply *psy); extern int power_supply_powers(struct power_supply *psy, struct device *dev); -- cgit v1.2.3 From 0b582611a8f4270fa357a22a546909b2dd5fc5fe Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 15 Oct 2024 10:28:43 +0900 Subject: ftrace: Use arch_ftrace_regs() for ftrace_regs_*() macros Since the arch_ftrace_get_regs(fregs) is only valid when the FL_SAVE_REGS is set, we need to use `&arch_ftrace_regs()->regs` for ftrace_regs_*() APIs because those APIs are for ftrace_regs, not complete pt_regs. Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Alan Maguire Cc: Mark Rutland Link: https://lore.kernel.org/172895572290.107311.16057631001860177198.stgit@devnote2 Fixes: e4cf33ca4812 ("ftrace: Consolidate ftrace_regs accessor functions for archs using pt_regs") Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace_regs.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_regs.h b/include/linux/ftrace_regs.h index dea6a0851b74..b78a0a60515b 100644 --- a/include/linux/ftrace_regs.h +++ b/include/linux/ftrace_regs.h @@ -17,17 +17,17 @@ struct __arch_ftrace_regs { struct ftrace_regs; #define ftrace_regs_get_instruction_pointer(fregs) \ - instruction_pointer(arch_ftrace_get_regs(fregs)) + instruction_pointer(&arch_ftrace_regs(fregs)->regs) #define ftrace_regs_get_argument(fregs, n) \ - regs_get_kernel_argument(arch_ftrace_get_regs(fregs), n) + regs_get_kernel_argument(&arch_ftrace_regs(fregs)->regs, n) #define ftrace_regs_get_stack_pointer(fregs) \ - kernel_stack_pointer(arch_ftrace_get_regs(fregs)) + kernel_stack_pointer(&arch_ftrace_regs(fregs)->regs) #define ftrace_regs_return_value(fregs) \ - regs_return_value(arch_ftrace_get_regs(fregs)) + regs_return_value(&arch_ftrace_regs(fregs)->regs) #define ftrace_regs_set_return_value(fregs, ret) \ - regs_set_return_value(arch_ftrace_get_regs(fregs), ret) + regs_set_return_value(&arch_ftrace_regs(fregs)->regs, ret) #define ftrace_override_function_with_return(fregs) \ - override_function_with_return(arch_ftrace_get_regs(fregs)) + override_function_with_return(&arch_ftrace_regs(fregs)->regs) #define ftrace_regs_query_register_offset(name) \ regs_query_register_offset(name) -- cgit v1.2.3 From 2d17932da44fdc1ba835ad05110ab996d2912dbf Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 15 Oct 2024 10:28:53 +0900 Subject: ftrace: Rename ftrace_regs_return_value to ftrace_regs_get_return_value Rename ftrace_regs_return_value to ftrace_regs_get_return_value as same as other ftrace_regs_get/set_* APIs. arm64 and riscv are already using this new name. Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Alan Maguire Link: https://lore.kernel.org/172895573350.107311.7564634260652361511.stgit@devnote2 Acked-by: Mark Rutland Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace_regs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_regs.h b/include/linux/ftrace_regs.h index b78a0a60515b..be1ed0c891d0 100644 --- a/include/linux/ftrace_regs.h +++ b/include/linux/ftrace_regs.h @@ -22,7 +22,7 @@ struct ftrace_regs; regs_get_kernel_argument(&arch_ftrace_regs(fregs)->regs, n) #define ftrace_regs_get_stack_pointer(fregs) \ kernel_stack_pointer(&arch_ftrace_regs(fregs)->regs) -#define ftrace_regs_return_value(fregs) \ +#define ftrace_regs_get_return_value(fregs) \ regs_return_value(&arch_ftrace_regs(fregs)->regs) #define ftrace_regs_set_return_value(fregs, ret) \ regs_set_return_value(&arch_ftrace_regs(fregs)->regs, ret) -- cgit v1.2.3 From 102f085d84607462234ac60f6027973b45a9bde2 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 14 Oct 2024 10:22:21 +0200 Subject: timers: Rename usleep_idle_range() to usleep_range_idle() usleep_idle_range() is a variant of usleep_range(). Both are using usleep_range_state() as a base. To be able to find all the related functions in one go, rename it usleep_idle_range() to usleep_range_idle(). No functional change. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Reviewed-by: SeongJae Park Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-4-dc8b907cb62f@linutronix.de --- include/linux/delay.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/delay.h b/include/linux/delay.h index ff9cda975e30..2bc586aa2068 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -68,7 +68,7 @@ static inline void usleep_range(unsigned long min, unsigned long max) usleep_range_state(min, max, TASK_UNINTERRUPTIBLE); } -static inline void usleep_idle_range(unsigned long min, unsigned long max) +static inline void usleep_range_idle(unsigned long min, unsigned long max) { usleep_range_state(min, max, TASK_IDLE); } -- cgit v1.2.3 From f36eb171410839325fff9cd9b7b7400f7e606962 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 14 Oct 2024 10:22:22 +0200 Subject: timers: Update function descriptions of sleep/delay related functions A lot of commonly used functions for inserting a sleep or delay lack a proper function description. Add function descriptions to all of them to have important information in a central place close to the code. No functional change. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-5-dc8b907cb62f@linutronix.de --- include/linux/delay.h | 48 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/delay.h b/include/linux/delay.h index 2bc586aa2068..2de509e4adce 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -6,17 +6,7 @@ * Copyright (C) 1993 Linus Torvalds * * Delay routines, using a pre-computed "loops_per_jiffy" value. - * - * Please note that ndelay(), udelay() and mdelay() may return early for - * several reasons: - * 1. computed loops_per_jiffy too low (due to the time taken to - * execute the timer interrupt.) - * 2. cache behaviour affecting the time it takes to execute the - * loop function. - * 3. CPU clock rate changes. - * - * Please see this thread: - * https://lists.openwall.net/linux-kernel/2011/01/09/56 + * Sleep routines using timer list timers or hrtimers. */ #include @@ -35,12 +25,21 @@ extern unsigned long loops_per_jiffy; * The 2nd mdelay() definition ensures GCC will optimize away the * while loop for the common cases where n <= MAX_UDELAY_MS -- Paul G. */ - #ifndef MAX_UDELAY_MS #define MAX_UDELAY_MS 5 #endif #ifndef mdelay +/** + * mdelay - Inserting a delay based on milliseconds with busy waiting + * @n: requested delay in milliseconds + * + * See udelay() for basic information about mdelay() and it's variants. + * + * Please double check, whether mdelay() is the right way to go or whether a + * refactoring of the code is the better variant to be able to use msleep() + * instead. + */ #define mdelay(n) (\ (__builtin_constant_p(n) && (n)<=MAX_UDELAY_MS) ? udelay((n)*1000) : \ ({unsigned long __ms=(n); while (__ms--) udelay(1000);})) @@ -63,16 +62,41 @@ unsigned long msleep_interruptible(unsigned int msecs); void usleep_range_state(unsigned long min, unsigned long max, unsigned int state); +/** + * usleep_range - Sleep for an approximate time + * @min: Minimum time in microseconds to sleep + * @max: Maximum time in microseconds to sleep + * + * For basic information please refere to usleep_range_state(). + * + * The task will be in the state TASK_UNINTERRUPTIBLE during the sleep. + */ static inline void usleep_range(unsigned long min, unsigned long max) { usleep_range_state(min, max, TASK_UNINTERRUPTIBLE); } +/** + * usleep_range_idle - Sleep for an approximate time with idle time accounting + * @min: Minimum time in microseconds to sleep + * @max: Maximum time in microseconds to sleep + * + * For basic information please refere to usleep_range_state(). + * + * The sleeping task has the state TASK_IDLE during the sleep to prevent + * contribution to the load avarage. + */ static inline void usleep_range_idle(unsigned long min, unsigned long max) { usleep_range_state(min, max, TASK_IDLE); } +/** + * ssleep - wrapper for seconds around msleep + * @seconds: Requested sleep duration in seconds + * + * Please refere to msleep() for detailed information. + */ static inline void ssleep(unsigned int seconds) { msleep(seconds * 1000); -- cgit v1.2.3 From 82e11e47c1880362e05c065bef7dbe28a749555c Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 14 Oct 2024 10:22:24 +0200 Subject: timers: Adjust flseep() to reflect reality fsleep() simply implements the recommendations of the outdated documentation in "Documentation/timers/timers-howto.rst". This should be a user friendly interface to choose always the best timeout function approach: - udelay() for very short sleep durations shorter than 10 microseconds - usleep_range() for sleep durations until 20 milliseconds - msleep() for the others The actual implementation has several problems: - It does not take into account that HZ resolution also has an impact on granularity of jiffies and has also an impact on the granularity of the buckets of timer wheel levels. This means that accuracy for the timeout does not have an upper limit. When executing fsleep(20000) on a HZ=100 system, the possible additional slack will be 50% as the granularity of the buckets in the lowest level is 10 milliseconds. - The upper limit of usleep_range() is twice the requested timeout. When no other interrupts occur in this range, the maximum value is used. This means that the requested sleep length has then an additional delay of 100%. Change the thresholds for the decisions in fsleep() to make sure the maximum slack which is added to the sleep duration is 25%. Note: Outdated documentation will be updated in a followup patch. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-7-dc8b907cb62f@linutronix.de --- include/linux/delay.h | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/delay.h b/include/linux/delay.h index 2de509e4adce..89866bab100d 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -11,6 +11,7 @@ #include #include +#include extern unsigned long loops_per_jiffy; @@ -102,15 +103,35 @@ static inline void ssleep(unsigned int seconds) msleep(seconds * 1000); } -/* see Documentation/timers/timers-howto.rst for the thresholds */ +static const unsigned int max_slack_shift = 2; +#define USLEEP_RANGE_UPPER_BOUND ((TICK_NSEC << max_slack_shift) / NSEC_PER_USEC) + +/** + * fsleep - flexible sleep which autoselects the best mechanism + * @usecs: requested sleep duration in microseconds + * + * flseep() selects the best mechanism that will provide maximum 25% slack + * to the requested sleep duration. Therefore it uses: + * + * * udelay() loop for sleep durations <= 10 microseconds to avoid hrtimer + * overhead for really short sleep durations. + * * usleep_range() for sleep durations which would lead with the usage of + * msleep() to a slack larger than 25%. This depends on the granularity of + * jiffies. + * * msleep() for all other sleep durations. + * + * Note: When %CONFIG_HIGH_RES_TIMERS is not set, all sleeps are processed with + * the granularity of jiffies and the slack might exceed 25% especially for + * short sleep durations. + */ static inline void fsleep(unsigned long usecs) { if (usecs <= 10) udelay(usecs); - else if (usecs <= 20000) - usleep_range(usecs, 2 * usecs); + else if (usecs < USLEEP_RANGE_UPPER_BOUND) + usleep_range(usecs, usecs + (usecs >> max_slack_shift)); else - msleep(DIV_ROUND_UP(usecs, 1000)); + msleep(DIV_ROUND_UP(usecs, USEC_PER_MSEC)); } #endif /* defined(_LINUX_DELAY_H) */ -- cgit v1.2.3 From 89124747f096fc0fe44be0162c7b4fb3271739e8 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 14 Oct 2024 10:22:29 +0200 Subject: iopoll/regmap/phy/snd: Fix comment referencing outdated timer documentation Function descriptions in iopoll.h, regmap.h, phy.h and sound/soc/sof/ops.h copied all the same outdated documentation about sleep/delay function limitations. In those comments, the generic (and still outdated) timer documentation file is referenced. As proper function descriptions for used delay and sleep functions are in place, simply update the descriptions to reference to them. While at it fix missing colon after "Returns" in function description and move return value description to the end of the function description. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Andrew Lunn # for phy.h Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241014-devel-anna-maria-b4-timers-flseep-v3-12-dc8b907cb62f@linutronix.de --- include/linux/iopoll.h | 52 +++++++++++++++++++++++++------------------------- include/linux/phy.h | 9 +++++---- include/linux/regmap.h | 38 ++++++++++++++++++------------------ 3 files changed, 50 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iopoll.h b/include/linux/iopoll.h index 19a7b00baff4..91324c331a4b 100644 --- a/include/linux/iopoll.h +++ b/include/linux/iopoll.h @@ -19,19 +19,19 @@ * @op: accessor function (takes @args as its arguments) * @val: Variable to read the value into * @cond: Break condition (usually involving @val) - * @sleep_us: Maximum time to sleep between reads in us (0 - * tight-loops). Should be less than ~20ms since usleep_range - * is used (see Documentation/timers/timers-howto.rst). + * @sleep_us: Maximum time to sleep between reads in us (0 tight-loops). Please + * read usleep_range() function description for details and + * limitations. * @timeout_us: Timeout in us, 0 means never timeout * @sleep_before_read: if it is true, sleep @sleep_us before read. * @args: arguments for @op poll * - * Returns 0 on success and -ETIMEDOUT upon a timeout. In either - * case, the last read value at @args is stored in @val. Must not - * be called from atomic context if sleep_us or timeout_us are used. - * * When available, you'll probably want to use one of the specialized * macros defined below rather than this macro directly. + * + * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either + * case, the last read value at @args is stored in @val. Must not + * be called from atomic context if sleep_us or timeout_us are used. */ #define read_poll_timeout(op, val, cond, sleep_us, timeout_us, \ sleep_before_read, args...) \ @@ -64,22 +64,22 @@ * @op: accessor function (takes @args as its arguments) * @val: Variable to read the value into * @cond: Break condition (usually involving @val) - * @delay_us: Time to udelay between reads in us (0 tight-loops). Should - * be less than ~10us since udelay is used (see - * Documentation/timers/timers-howto.rst). + * @delay_us: Time to udelay between reads in us (0 tight-loops). Please + * read udelay() function description for details and + * limitations. * @timeout_us: Timeout in us, 0 means never timeout * @delay_before_read: if it is true, delay @delay_us before read. * @args: arguments for @op poll * - * Returns 0 on success and -ETIMEDOUT upon a timeout. In either - * case, the last read value at @args is stored in @val. - * * This macro does not rely on timekeeping. Hence it is safe to call even when * timekeeping is suspended, at the expense of an underestimation of wall clock * time, which is rather minimal with a non-zero delay_us. * * When available, you'll probably want to use one of the specialized * macros defined below rather than this macro directly. + * + * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either + * case, the last read value at @args is stored in @val. */ #define read_poll_timeout_atomic(op, val, cond, delay_us, timeout_us, \ delay_before_read, args...) \ @@ -119,17 +119,17 @@ * @addr: Address to poll * @val: Variable to read the value into * @cond: Break condition (usually involving @val) - * @sleep_us: Maximum time to sleep between reads in us (0 - * tight-loops). Should be less than ~20ms since usleep_range - * is used (see Documentation/timers/timers-howto.rst). + * @sleep_us: Maximum time to sleep between reads in us (0 tight-loops). Please + * read usleep_range() function description for details and + * limitations. * @timeout_us: Timeout in us, 0 means never timeout * - * Returns 0 on success and -ETIMEDOUT upon a timeout. In either - * case, the last read value at @addr is stored in @val. Must not - * be called from atomic context if sleep_us or timeout_us are used. - * * When available, you'll probably want to use one of the specialized * macros defined below rather than this macro directly. + * + * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either + * case, the last read value at @addr is stored in @val. Must not + * be called from atomic context if sleep_us or timeout_us are used. */ #define readx_poll_timeout(op, addr, val, cond, sleep_us, timeout_us) \ read_poll_timeout(op, val, cond, sleep_us, timeout_us, false, addr) @@ -140,16 +140,16 @@ * @addr: Address to poll * @val: Variable to read the value into * @cond: Break condition (usually involving @val) - * @delay_us: Time to udelay between reads in us (0 tight-loops). Should - * be less than ~10us since udelay is used (see - * Documentation/timers/timers-howto.rst). + * @delay_us: Time to udelay between reads in us (0 tight-loops). Please + * read udelay() function description for details and + * limitations. * @timeout_us: Timeout in us, 0 means never timeout * - * Returns 0 on success and -ETIMEDOUT upon a timeout. In either - * case, the last read value at @addr is stored in @val. - * * When available, you'll probably want to use one of the specialized * macros defined below rather than this macro directly. + * + * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either + * case, the last read value at @addr is stored in @val. */ #define readx_poll_timeout_atomic(op, addr, val, cond, delay_us, timeout_us) \ read_poll_timeout_atomic(op, val, cond, delay_us, timeout_us, false, addr) diff --git a/include/linux/phy.h b/include/linux/phy.h index a98bc91a0cde..504766d4b2d5 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1378,12 +1378,13 @@ int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum); * @regnum: The register on the MMD to read * @val: Variable to read the register into * @cond: Break condition (usually involving @val) - * @sleep_us: Maximum time to sleep between reads in us (0 - * tight-loops). Should be less than ~20ms since usleep_range - * is used (see Documentation/timers/timers-howto.rst). + * @sleep_us: Maximum time to sleep between reads in us (0 tight-loops). Please + * read usleep_range() function description for details and + * limitations. * @timeout_us: Timeout in us, 0 means never timeout * @sleep_before_read: if it is true, sleep @sleep_us before read. - * Returns 0 on success and -ETIMEDOUT upon a timeout. In either + * + * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either * case, the last read value at @args is stored in @val. Must not * be called from atomic context if sleep_us or timeout_us are used. */ diff --git a/include/linux/regmap.h b/include/linux/regmap.h index f9ccad32fc5c..75f162b60ba1 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -106,17 +106,17 @@ struct reg_sequence { * @addr: Address to poll * @val: Unsigned integer variable to read the value into * @cond: Break condition (usually involving @val) - * @sleep_us: Maximum time to sleep between reads in us (0 - * tight-loops). Should be less than ~20ms since usleep_range - * is used (see Documentation/timers/timers-howto.rst). + * @sleep_us: Maximum time to sleep between reads in us (0 tight-loops). Please + * read usleep_range() function description for details and + * limitations. * @timeout_us: Timeout in us, 0 means never timeout * - * Returns 0 on success and -ETIMEDOUT upon a timeout or the regmap_read + * This is modelled after the readx_poll_timeout macros in linux/iopoll.h. + * + * Returns: 0 on success and -ETIMEDOUT upon a timeout or the regmap_read * error return value in case of a error read. In the two former cases, * the last read value at @addr is stored in @val. Must not be called * from atomic context if sleep_us or timeout_us are used. - * - * This is modelled after the readx_poll_timeout macros in linux/iopoll.h. */ #define regmap_read_poll_timeout(map, addr, val, cond, sleep_us, timeout_us) \ ({ \ @@ -133,20 +133,20 @@ struct reg_sequence { * @addr: Address to poll * @val: Unsigned integer variable to read the value into * @cond: Break condition (usually involving @val) - * @delay_us: Time to udelay between reads in us (0 tight-loops). - * Should be less than ~10us since udelay is used - * (see Documentation/timers/timers-howto.rst). + * @delay_us: Time to udelay between reads in us (0 tight-loops). Please + * read udelay() function description for details and + * limitations. * @timeout_us: Timeout in us, 0 means never timeout * - * Returns 0 on success and -ETIMEDOUT upon a timeout or the regmap_read - * error return value in case of a error read. In the two former cases, - * the last read value at @addr is stored in @val. - * * This is modelled after the readx_poll_timeout_atomic macros in linux/iopoll.h. * * Note: In general regmap cannot be used in atomic context. If you want to use * this macro then first setup your regmap for atomic use (flat or no cache * and MMIO regmap). + * + * Returns: 0 on success and -ETIMEDOUT upon a timeout or the regmap_read + * error return value in case of a error read. In the two former cases, + * the last read value at @addr is stored in @val. */ #define regmap_read_poll_timeout_atomic(map, addr, val, cond, delay_us, timeout_us) \ ({ \ @@ -177,17 +177,17 @@ struct reg_sequence { * @field: Regmap field to read from * @val: Unsigned integer variable to read the value into * @cond: Break condition (usually involving @val) - * @sleep_us: Maximum time to sleep between reads in us (0 - * tight-loops). Should be less than ~20ms since usleep_range - * is used (see Documentation/timers/timers-howto.rst). + * @sleep_us: Maximum time to sleep between reads in us (0 tight-loops). Please + * read usleep_range() function description for details and + * limitations. * @timeout_us: Timeout in us, 0 means never timeout * - * Returns 0 on success and -ETIMEDOUT upon a timeout or the regmap_field_read + * This is modelled after the readx_poll_timeout macros in linux/iopoll.h. + * + * Returns: 0 on success and -ETIMEDOUT upon a timeout or the regmap_field_read * error return value in case of a error read. In the two former cases, * the last read value at @addr is stored in @val. Must not be called * from atomic context if sleep_us or timeout_us are used. - * - * This is modelled after the readx_poll_timeout macros in linux/iopoll.h. */ #define regmap_field_read_poll_timeout(field, val, cond, sleep_us, timeout_us) \ ({ \ -- cgit v1.2.3 From 719258c55f7e9c123ef2a0941c6730b6d43c1bc2 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 9 Sep 2024 15:49:41 +0200 Subject: mfd: palmas: Constify strings with regulator names The names of regulators are static const strings, so pointers can be made as pointers to const for code safety. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240909134941.121847-1-krzysztof.kozlowski@linaro.org Signed-off-by: Lee Jones --- include/linux/mfd/palmas.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/palmas.h b/include/linux/mfd/palmas.h index eda1ffd99c1a..dabcc0dea802 100644 --- a/include/linux/mfd/palmas.h +++ b/include/linux/mfd/palmas.h @@ -98,8 +98,8 @@ struct palmas_sleep_requestor_info { }; struct palmas_regs_info { - char *name; - char *sname; + const char *name; + const char *sname; u8 vsel_addr; u8 ctrl_addr; u8 tstep_addr; -- cgit v1.2.3 From d5340a18cd321075442677ce1002d8e163b17b67 Mon Sep 17 00:00:00 2001 From: Dzmitry Sankouski Date: Thu, 26 Sep 2024 13:28:10 +0300 Subject: mfd: max77693: Remove unused max77693_irq_source declarations Remove `enum max77693_irq_source` declaration because unused. Signed-off-by: Dzmitry Sankouski Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240617-starqltechn_integration_upstream-v5-1-125d9228d751@gmail.com Signed-off-by: Lee Jones --- include/linux/mfd/max77693-private.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/max77693-private.h b/include/linux/mfd/max77693-private.h index 20c5e02ed9da..c324d548619e 100644 --- a/include/linux/mfd/max77693-private.h +++ b/include/linux/mfd/max77693-private.h @@ -419,17 +419,6 @@ enum max77693_haptic_reg { #define MAX77693_CONFIG2_MEN 6 #define MAX77693_CONFIG2_HTYP 5 -enum max77693_irq_source { - LED_INT = 0, - TOPSYS_INT, - CHG_INT, - MUIC_INT1, - MUIC_INT2, - MUIC_INT3, - - MAX77693_IRQ_GROUP_NR, -}; - #define SRC_IRQ_CHARGER BIT(0) #define SRC_IRQ_TOP BIT(1) #define SRC_IRQ_FLASH BIT(2) -- cgit v1.2.3 From bf231e5febcf9358d7e70a2c6974548f7f3e4f61 Mon Sep 17 00:00:00 2001 From: Dzmitry Sankouski Date: Thu, 26 Sep 2024 12:47:31 +0300 Subject: mfd: sec-core: Add support for the Samsung s2dos05 S2DOS05 is a panel/touchscreen PMIC, often found in Samsung phones. We define regulator sub-device for which driver will be added in subsequent patch. The device also has ADC for power and current measurements. Signed-off-by: Dzmitry Sankouski Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240617-starqltechn_integration_upstream-v5-2-ea1109029ba5@gmail.com Signed-off-by: Lee Jones --- include/linux/mfd/samsung/core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mfd/samsung/core.h b/include/linux/mfd/samsung/core.h index a212b9f72bc9..750274d41fc0 100644 --- a/include/linux/mfd/samsung/core.h +++ b/include/linux/mfd/samsung/core.h @@ -37,6 +37,7 @@ struct gpio_desc; enum sec_device_type { S5M8767X, + S2DOS05, S2MPA01, S2MPS11X, S2MPS13X, -- cgit v1.2.3 From 2b627246c0cf2b54f1b17531dde964cf5b99b2a5 Mon Sep 17 00:00:00 2001 From: Pavan Holla Date: Tue, 10 Sep 2024 10:15:21 +0000 Subject: platform/chrome: Update EC feature flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define EC_FEATURE_UCSI_PPM to enable usage of the cros_ec_ucsi driver. Also, add any feature flags that are implemented by the EC but are missing in the kernel header. Signed-off-by: Pavan Holla Signed-off-by: Łukasz Bartosik Acked-by: Tzung-Bi Shih Link: https://lore.kernel.org/r/20240910101527.603452-3-ukaszb@chromium.org Signed-off-by: Lee Jones --- include/linux/platform_data/cros_ec_commands.h | 32 ++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h index e574b790be6f..b3c4993e656e 100644 --- a/include/linux/platform_data/cros_ec_commands.h +++ b/include/linux/platform_data/cros_ec_commands.h @@ -1312,6 +1312,38 @@ enum ec_feature_code { * The EC supports the AP composing VDMs for us to send. */ EC_FEATURE_TYPEC_AP_VDM_SEND = 46, + /* + * The EC supports system safe mode panic recovery. + */ + EC_FEATURE_SYSTEM_SAFE_MODE = 47, + /* + * The EC will reboot on runtime assertion failures. + */ + EC_FEATURE_ASSERT_REBOOTS = 48, + /* + * The EC image is built with tokenized logging enabled. + */ + EC_FEATURE_TOKENIZED_LOGGING = 49, + /* + * The EC supports triggering an STB dump. + */ + EC_FEATURE_AMD_STB_DUMP = 50, + /* + * The EC supports memory dump commands. + */ + EC_FEATURE_MEMORY_DUMP = 51, + /* + * The EC supports DP2.1 capability + */ + EC_FEATURE_TYPEC_DP2_1 = 52, + /* + * The MCU is System Companion Processor Core 1 + */ + EC_FEATURE_SCP_C1 = 53, + /* + * The EC supports UCSI PPM. + */ + EC_FEATURE_UCSI_PPM = 54, }; #define EC_FEATURE_MASK_0(event_code) BIT(event_code % 32) -- cgit v1.2.3 From 522249f05c5551aec9ec0ba9b6438f1ec19c138d Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 3 Oct 2024 16:29:22 +0200 Subject: fanotify: allow reporting errors on failure to open fd When working in "fd mode", fanotify_read() needs to open an fd from a dentry to report event->fd to userspace. Opening an fd from dentry can fail for several reasons. For example, when tasks are gone and we try to open their /proc files or we try to open a WRONLY file like in sysfs or when trying to open a file that was deleted on the remote network server. Add a new flag FAN_REPORT_FD_ERROR for fanotify_init(). For a group with FAN_REPORT_FD_ERROR, we will send the event with the error instead of the open fd, otherwise userspace may not get the error at all. For an overflow event, we report -EBADF to avoid confusing FAN_NOFD with -EPERM. Similarly for pidfd open errors we report either -ESRCH or the open error instead of FAN_NOPIDFD and FAN_EPIDFD. In any case, userspace will not know which file failed to open, so add a debug print for further investigation. Reported-by: Krishna Vivek Vitta Link: https://lore.kernel.org/linux-fsdevel/SI2P153MB07182F3424619EDDD1F393EED46D2@SI2P153MB0718.APCP153.PROD.OUTLOOK.COM/ Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/20241003142922.111539-1-amir73il@gmail.com --- include/linux/fanotify.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 4f1c4f603118..89ff45bd6f01 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -36,6 +36,7 @@ #define FANOTIFY_ADMIN_INIT_FLAGS (FANOTIFY_PERM_CLASSES | \ FAN_REPORT_TID | \ FAN_REPORT_PIDFD | \ + FAN_REPORT_FD_ERROR | \ FAN_UNLIMITED_QUEUE | \ FAN_UNLIMITED_MARKS) -- cgit v1.2.3 From d6083f040d5d8f8d748462c77e90547097df936e Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Tue, 15 Oct 2024 23:02:06 +0800 Subject: bpf: Prevent tailcall infinite loop caused by freplace There is a potential infinite loop issue that can occur when using a combination of tail calls and freplace. In an upcoming selftest, the attach target for entry_freplace of tailcall_freplace.c is subprog_tc of tc_bpf2bpf.c, while the tail call in entry_freplace leads to entry_tc. This results in an infinite loop: entry_tc -> subprog_tc -> entry_freplace --tailcall-> entry_tc. The problem arises because the tail_call_cnt in entry_freplace resets to zero each time entry_freplace is executed, causing the tail call mechanism to never terminate, eventually leading to a kernel panic. To fix this issue, the solution is twofold: 1. Prevent updating a program extended by an freplace program to a prog_array map. 2. Prevent extending a program that is already part of a prog_array map with an freplace program. This ensures that: * If a program or its subprogram has been extended by an freplace program, it can no longer be updated to a prog_array map. * If a program has been added to a prog_array map, neither it nor its subprograms can be extended by an freplace program. Moreover, an extension program should not be tailcalled. As such, return -EINVAL if the program has a type of BPF_PROG_TYPE_EXT when adding it to a prog_array map. Additionally, fix a minor code style issue by replacing eight spaces with a tab for proper formatting. Reviewed-by: Eduard Zingerman Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20241015150207.70264-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 19d8ca8ac960..0c216e71cec7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1292,8 +1292,12 @@ void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len); bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr); #ifdef CONFIG_BPF_JIT -int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr); -int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr); +int bpf_trampoline_link_prog(struct bpf_tramp_link *link, + struct bpf_trampoline *tr, + struct bpf_prog *tgt_prog); +int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, + struct bpf_trampoline *tr, + struct bpf_prog *tgt_prog); struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info); void bpf_trampoline_put(struct bpf_trampoline *tr); @@ -1374,12 +1378,14 @@ void bpf_jit_uncharge_modmem(u32 size); bool bpf_prog_has_trampoline(const struct bpf_prog *prog); #else static inline int bpf_trampoline_link_prog(struct bpf_tramp_link *link, - struct bpf_trampoline *tr) + struct bpf_trampoline *tr, + struct bpf_prog *tgt_prog) { return -ENOTSUPP; } static inline int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, - struct bpf_trampoline *tr) + struct bpf_trampoline *tr, + struct bpf_prog *tgt_prog) { return -ENOTSUPP; } @@ -1483,6 +1489,9 @@ struct bpf_prog_aux { bool xdp_has_frags; bool exception_cb; bool exception_boundary; + bool is_extended; /* true if extended by freplace program */ + u64 prog_array_member_cnt; /* counts how many times as member of prog_array */ + struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */ struct bpf_arena *arena; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; -- cgit v1.2.3 From ce1dfe6d328966b75821c1f043a940eb2569768a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 12 Oct 2024 20:32:43 +0900 Subject: PCI: endpoint: Introduce pci_epc_mem_map()/unmap() Some endpoint controllers have requirements on the alignment of the controller physical memory address that must be used to map a RC PCI address region. For instance, the endpoint controller of the RK3399 SoC uses at most the lower 20 bits of a physical memory address region as the lower bits of a RC PCI address region. For mapping a PCI address region of size bytes starting from pci_addr, the exact number of address bits used is the number of address bits changing in the address range [pci_addr..pci_addr + size - 1]. For this example, this creates the following constraints: 1) The offset into the controller physical memory allocated for a mapping depends on the mapping size *and* the starting PCI address for the mapping. 2) A mapping size cannot exceed the controller windows size (1MB) minus the offset needed into the allocated physical memory, which can end up being a smaller size than the desired mapping size. Handling these constraints independently of the controller being used in an endpoint function driver is not possible with the current EPC API as only the ->align field in struct pci_epc_features is provided but used for BAR (inbound ATU mappings) mapping only. A new API is needed for function drivers to discover mapping constraints and handle non-static requirements based on the RC PCI address range to access. Introduce the endpoint controller operation ->align_addr() to allow the EPC core functions to obtain the size and the offset into a controller address region that must be allocated and mapped to access a RC PCI address region. The size of the mapping provided by the align_addr() operation can then be used as the size argument for the function pci_epc_mem_alloc_addr() and the offset into the allocated controller memory provided can be used to correctly handle data transfers. For endpoint controllers that have PCI address alignment constraints, the align_addr() operation may indicate upon return an effective PCI address mapping size that is smaller (but not 0) than the requested PCI address region size. The controller ->align_addr() operation is optional: controllers that do not have any alignment constraints for mapping RC PCI address regions do not need to implement this operation. For such controllers, it is always assumed that the mapping size is equal to the requested size of the PCI region and that the mapping offset is 0. The function pci_epc_mem_map() is introduced to use this new controller operation (if it is defined) to handle controller memory allocation and mapping to a RC PCI address region in endpoint function drivers. This function first uses the ->align_addr() controller operation to determine the controller memory address size (and offset into) needed for mapping an RC PCI address region. The result of this operation is used to allocate a controller physical memory region using pci_epc_mem_alloc_addr() and then to map that memory to the RC PCI address space with pci_epc_map_addr(). Since ->align_addr() () may indicate that not all of a RC PCI address region can be mapped, pci_epc_mem_map() may only partially map the RC PCI address region specified. It is the responsibility of the caller (an endpoint function driver) to handle such smaller mapping by repeatedly using pci_epc_mem_map() over the desried PCI address range. The counterpart of pci_epc_mem_map() to unmap and free a mapped controller memory address region is pci_epc_mem_unmap(). Both functions operate using the new struct pci_epc_map data structure. This new structure represents a mapping PCI address, mapping effective size, the size of the controller memory needed for the mapping as well as the physical and virtual CPU addresses of the mapping (phys_base and virt_base fields). For convenience, the physical and virtual CPU addresses within that mapping to use to access the target RC PCI address region are also provided (phys_addr and virt_addr fields). Endpoint function drivers can use struct pci_epc_map to access the mapped RC PCI address region using the ->virt_addr and ->pci_size fields. Co-developed-by: Rick Wertenbroek Signed-off-by: Rick Wertenbroek Signed-off-by: Damien Le Moal Reviewed-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20241012113246.95634-4-dlemoal@kernel.org [mani: squashed the patch that changed phy_addr_t to u64] Signed-off-by: Manivannan Sadhasivam --- include/linux/pci-epc.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index 42ef06136bd1..de8cc3658220 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -32,11 +32,43 @@ pci_epc_interface_string(enum pci_epc_interface_type type) } } +/** + * struct pci_epc_map - information about EPC memory for mapping a RC PCI + * address range + * @pci_addr: start address of the RC PCI address range to map + * @pci_size: size of the RC PCI address range mapped from @pci_addr + * @map_pci_addr: RC PCI address used as the first address mapped (may be lower + * than @pci_addr) + * @map_size: size of the controller memory needed for mapping the RC PCI address + * range @pci_addr..@pci_addr+@pci_size + * @phys_base: base physical address of the allocated EPC memory for mapping the + * RC PCI address range + * @phys_addr: physical address at which @pci_addr is mapped + * @virt_base: base virtual address of the allocated EPC memory for mapping the + * RC PCI address range + * @virt_addr: virtual address at which @pci_addr is mapped + */ +struct pci_epc_map { + u64 pci_addr; + size_t pci_size; + + u64 map_pci_addr; + size_t map_size; + + phys_addr_t phys_base; + phys_addr_t phys_addr; + void __iomem *virt_base; + void __iomem *virt_addr; +}; + /** * struct pci_epc_ops - set of function pointers for performing EPC operations * @write_header: ops to populate configuration space header * @set_bar: ops to configure the BAR * @clear_bar: ops to reset the BAR + * @align_addr: operation to get the mapping address, mapping size and offset + * into a controller memory window needed to map an RC PCI address + * region * @map_addr: ops to map CPU address to PCI address * @unmap_addr: ops to unmap CPU address and PCI address * @set_msi: ops to set the requested number of MSI interrupts in the MSI @@ -61,6 +93,8 @@ struct pci_epc_ops { struct pci_epf_bar *epf_bar); void (*clear_bar)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, struct pci_epf_bar *epf_bar); + u64 (*align_addr)(struct pci_epc *epc, u64 pci_addr, size_t *size, + size_t *offset); int (*map_addr)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t addr, u64 pci_addr, size_t size); void (*unmap_addr)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, @@ -278,6 +312,10 @@ void __iomem *pci_epc_mem_alloc_addr(struct pci_epc *epc, phys_addr_t *phys_addr, size_t size); void pci_epc_mem_free_addr(struct pci_epc *epc, phys_addr_t phys_addr, void __iomem *virt_addr, size_t size); +int pci_epc_mem_map(struct pci_epc *epc, u8 func_no, u8 vfunc_no, + u64 pci_addr, size_t pci_size, struct pci_epc_map *map); +void pci_epc_mem_unmap(struct pci_epc *epc, u8 func_no, u8 vfunc_no, + struct pci_epc_map *map); #else static inline void pci_epc_init_notify(struct pci_epc *epc) -- cgit v1.2.3 From 5280a14a6079040205a1d968cd80f20448d047c7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 15 Oct 2024 12:09:32 -0700 Subject: genirq: Introduce irq_get_nr_irqs() and irq_set_nr_irqs() Prepare for changing 'nr_irqs' from an exported global variable into a variable with file scope. This will prevent accidental changes of assignments to a local variable 'nr_irqs' into assignments to the global 'nr_irqs' variable. Suppose that a patch would be submitted for review that removes a declaration of a local variable with the name 'nr_irqs' and that that patch does not remove all assignments to that local variable. Such a patch converts an assignment to a local variable into an assignment into a global variable. If the 'nr_irqs' assignment is more than three lines away from other changes, the assignment won't be included in the diff context lines and hence won't be visible without inspecting the modified file. With these abstraction series applied, such accidental conversions from assignments to a local variable into an assignment to a global variable are converted into a compilation error. Signed-off-by: Bart Van Assche Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241015190953.1266194-2-bvanassche@acm.org --- include/linux/irqnr.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h index 3496baa0b07f..7419b807b71b 100644 --- a/include/linux/irqnr.h +++ b/include/linux/irqnr.h @@ -6,6 +6,8 @@ extern int nr_irqs; +unsigned int irq_get_nr_irqs(void) __pure; +unsigned int irq_set_nr_irqs(unsigned int nr); extern struct irq_desc *irq_to_desc(unsigned int irq); unsigned int irq_get_next_irq(unsigned int offset); -- cgit v1.2.3 From 1ad2048bf7146efb83bc033147ca1611a7fe8494 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 15 Oct 2024 12:09:52 -0700 Subject: genirq: Switch to irq_get_nr_irqs() Use the irq_get_nr_irqs() function instead of the global variable 'nr_irqs'. Cache the result of this function in a local variable in order not to rely on CSE (common subexpression elimination). Prepare for changing 'nr_irqs' from an exported global variable into a variable with file scope. Signed-off-by: Bart Van Assche Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241015190953.1266194-22-bvanassche@acm.org --- include/linux/irqnr.h | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h index 7419b807b71b..a33088d27c54 100644 --- a/include/linux/irqnr.h +++ b/include/linux/irqnr.h @@ -11,26 +11,31 @@ unsigned int irq_set_nr_irqs(unsigned int nr); extern struct irq_desc *irq_to_desc(unsigned int irq); unsigned int irq_get_next_irq(unsigned int offset); -# define for_each_irq_desc(irq, desc) \ - for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs; \ - irq++, desc = irq_to_desc(irq)) \ - if (!desc) \ - ; \ - else - +#define for_each_irq_desc(irq, desc) \ + for (unsigned int __nr_irqs__ = irq_get_nr_irqs(); __nr_irqs__; \ + __nr_irqs__ = 0) \ + for (irq = 0, desc = irq_to_desc(irq); irq < __nr_irqs__; \ + irq++, desc = irq_to_desc(irq)) \ + if (!desc) \ + ; \ + else # define for_each_irq_desc_reverse(irq, desc) \ - for (irq = nr_irqs - 1, desc = irq_to_desc(irq); irq >= 0; \ - irq--, desc = irq_to_desc(irq)) \ + for (irq = irq_get_nr_irqs() - 1, desc = irq_to_desc(irq); \ + irq >= 0; irq--, desc = irq_to_desc(irq)) \ if (!desc) \ ; \ else -# define for_each_active_irq(irq) \ - for (irq = irq_get_next_irq(0); irq < nr_irqs; \ - irq = irq_get_next_irq(irq + 1)) +#define for_each_active_irq(irq) \ + for (unsigned int __nr_irqs__ = irq_get_nr_irqs(); __nr_irqs__; \ + __nr_irqs__ = 0) \ + for (irq = irq_get_next_irq(0); irq < __nr_irqs__; \ + irq = irq_get_next_irq(irq + 1)) -#define for_each_irq_nr(irq) \ - for (irq = 0; irq < nr_irqs; irq++) +#define for_each_irq_nr(irq) \ + for (unsigned int __nr_irqs__ = irq_get_nr_irqs(); __nr_irqs__; \ + __nr_irqs__ = 0) \ + for (irq = 0; irq < __nr_irqs__; irq++) #endif -- cgit v1.2.3 From ef4c675dc2961ee533bdc1ea20390761df0af5be Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 15 Oct 2024 12:09:53 -0700 Subject: genirq: Unexport nr_irqs Unexport nr_irqs and declare it static now that all code that reads or modifies nr_irqs has been converted to number_of_interrupts() / set_number_of_interrupts(). Change the type of 'nr_irqs' from 'int' into 'unsigned int' to match the return type and argument type of the irq_get_nr_iqs() / irq_set_nr_irqs() functions. Signed-off-by: Bart Van Assche Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241015190953.1266194-23-bvanassche@acm.org --- include/linux/irqnr.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h index a33088d27c54..e97206c721a0 100644 --- a/include/linux/irqnr.h +++ b/include/linux/irqnr.h @@ -5,7 +5,6 @@ #include -extern int nr_irqs; unsigned int irq_get_nr_irqs(void) __pure; unsigned int irq_set_nr_irqs(unsigned int nr); extern struct irq_desc *irq_to_desc(unsigned int irq); -- cgit v1.2.3 From cf70da29c4993bf23df68b67a82dfa3da8234e75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 5 Oct 2024 12:06:16 +0200 Subject: power: supply: core: unexport power_supply_property_is_writeable() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit ("power: supply: Drop use_cnt check from power_supply_property_is_writeable()"), this function does not check use_cnt anymore, making it unsuitable for general usage. As it is only used by the psy core anyways, remove it from the public header and unexport it to avoid misusage. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20241005-power-supply-cleanups-v1-2-45303b2d0a4d@weissschuh.net Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 14eedefb3ac8..c1d6cb7f4c40 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -865,8 +865,6 @@ static inline int power_supply_set_property(struct power_supply *psy, const union power_supply_propval *val) { return 0; } #endif -extern int power_supply_property_is_writeable(struct power_supply *psy, - enum power_supply_property psp); extern void power_supply_external_power_changed(struct power_supply *psy); extern struct power_supply *__must_check -- cgit v1.2.3 From 8060bcb109f2d9b85451e84a7a08042da40368df Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Wed, 16 Oct 2024 16:18:31 +0300 Subject: usb: typec: Add attribute file showing the supported USB modes of the port This attribute file, named "usb_capability", will show the supported USB modes, which are USB 2.0, USB 3.2 and USB4. These modes are defined in the USB Type-C (R2.0) and USB Power Delivery (R3.0 V2.0) Specifications. Signed-off-by: Heikki Krogerus Link: https://lore.kernel.org/r/20241016131834.898599-2-heikki.krogerus@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/typec.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h index 549275f8ac1b..f7edced5b10b 100644 --- a/include/linux/usb/typec.h +++ b/include/linux/usb/typec.h @@ -87,6 +87,17 @@ enum typec_orientation { TYPEC_ORIENTATION_REVERSE, }; +enum usb_mode { + USB_MODE_NONE, + USB_MODE_USB2, + USB_MODE_USB3, + USB_MODE_USB4 +}; + +#define USB_CAPABILITY_USB2 BIT(0) +#define USB_CAPABILITY_USB3 BIT(1) +#define USB_CAPABILITY_USB4 BIT(2) + /* * struct enter_usb_data - Enter_USB Message details * @eudo: Enter_USB Data Object @@ -240,6 +251,7 @@ struct typec_partner_desc { * @port_type_set: Set port type * @pd_get: Get available USB Power Delivery Capabilities. * @pd_set: Set USB Power Delivery Capabilities. + * @default_usb_mode_set: USB Mode to be used by default with Enter_USB Message */ struct typec_operations { int (*try_role)(struct typec_port *port, int role); @@ -250,6 +262,7 @@ struct typec_operations { enum typec_port_type type); struct usb_power_delivery **(*pd_get)(struct typec_port *port); int (*pd_set)(struct typec_port *port, struct usb_power_delivery *pd); + int (*default_usb_mode_set)(struct typec_port *port, enum usb_mode mode); }; enum usb_pd_svdm_ver { @@ -267,6 +280,7 @@ enum usb_pd_svdm_ver { * @svdm_version: USB PD Structured VDM version if supported * @prefer_role: Initial role preference (DRP ports). * @accessory: Supported Accessory Modes + * @usb_capability: Supported USB Modes * @fwnode: Optional fwnode of the port * @driver_data: Private pointer for driver specific info * @pd: Optional USB Power Delivery Support @@ -283,6 +297,7 @@ struct typec_capability { int prefer_role; enum typec_accessory accessory[TYPEC_MAX_ACCESSORY]; unsigned int orientation_aware:1; + u8 usb_capability; struct fwnode_handle *fwnode; void *driver_data; @@ -350,6 +365,8 @@ int typec_port_set_usb_power_delivery(struct typec_port *port, struct usb_power_ int typec_partner_set_usb_power_delivery(struct typec_partner *partner, struct usb_power_delivery *pd); +void typec_port_set_usb_mode(struct typec_port *port, enum usb_mode mode); + /** * struct typec_connector - Representation of Type-C port for external drivers * @attach: notification about device removal -- cgit v1.2.3 From 2140a952c4e9c73993ae6d9c2cc674d263d4beab Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Wed, 16 Oct 2024 16:18:32 +0300 Subject: usb: typec: Add attribute file showing the USB Modes of the partner This attribute file shows the supported USB modes (USB 2.0, USB 3.0 and USB4) of the partner, and the currently active mode. The active mode is determined primarily by checking the speed of the enumerated USB device. When USB Power Delivery is supported, the active USB mode should be always the mode that was used with the Enter_USB Message, regardless of the result of the USB enumeration. The port drivers can separately assign the mode with a dedicated API. If USB Power Delivery Identity is supplied for the partner device, the supported modes are extracted from it. Signed-off-by: Heikki Krogerus Link: https://lore.kernel.org/r/20241016131834.898599-3-heikki.krogerus@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/typec.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h index f7edced5b10b..d616b8807000 100644 --- a/include/linux/usb/typec.h +++ b/include/linux/usb/typec.h @@ -220,6 +220,7 @@ struct typec_cable_desc { * @accessory: Audio, Debug or none. * @identity: Discover Identity command data * @pd_revision: USB Power Delivery Specification Revision if supported + * @usb_capability: Supported USB Modes * @attach: Notification about attached USB device * @deattach: Notification about removed USB device * @@ -237,6 +238,7 @@ struct typec_partner_desc { enum typec_accessory accessory; struct usb_pd_identity *identity; u16 pd_revision; /* 0300H = "3.0" */ + u8 usb_capability; void (*attach)(struct typec_partner *partner, struct device *dev); void (*deattach)(struct typec_partner *partner, struct device *dev); @@ -252,6 +254,7 @@ struct typec_partner_desc { * @pd_get: Get available USB Power Delivery Capabilities. * @pd_set: Set USB Power Delivery Capabilities. * @default_usb_mode_set: USB Mode to be used by default with Enter_USB Message + * @enter_usb_mode: Change the active USB Mode */ struct typec_operations { int (*try_role)(struct typec_port *port, int role); @@ -263,6 +266,7 @@ struct typec_operations { struct usb_power_delivery **(*pd_get)(struct typec_port *port); int (*pd_set)(struct typec_port *port, struct usb_power_delivery *pd); int (*default_usb_mode_set)(struct typec_port *port, enum usb_mode mode); + int (*enter_usb_mode)(struct typec_port *port, enum usb_mode mode); }; enum usb_pd_svdm_ver { @@ -365,6 +369,7 @@ int typec_port_set_usb_power_delivery(struct typec_port *port, struct usb_power_ int typec_partner_set_usb_power_delivery(struct typec_partner *partner, struct usb_power_delivery *pd); +void typec_partner_set_usb_mode(struct typec_partner *partner, enum usb_mode usb_mode); void typec_port_set_usb_mode(struct typec_port *port, enum usb_mode mode); /** -- cgit v1.2.3 From f35533a0e60946ee3fb8adccf8a36024c6f1fe40 Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Wed, 16 Oct 2024 18:23:23 +0800 Subject: soundwire: sdw_intel: include linux/acpi.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For the acpi_handle stuff. Signed-off-by: Bard Liao Reviewed-by: Péter Ujfalusi Acked-by: Vinod Koul Link: https://patch.msgid.link/20241016102333.294448-2-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- include/linux/soundwire/sdw_intel.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 37ae69365fe2..fae345987b8c 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -4,6 +4,7 @@ #ifndef __SDW_INTEL_H #define __SDW_INTEL_H +#include #include #include -- cgit v1.2.3 From 4b224ff80d6609811ec6ab5406a16c92825cfb1a Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Wed, 16 Oct 2024 18:23:24 +0800 Subject: ASoC/soundwire: remove sdw_slave_extended_id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This structure is used to copy information from the 'sdw_slave' structures, it's better to create a flexible array of 'sdw_slave' pointers and directly access the information. This will also help access additional information stored in the 'sdw_slave' structure, such as an SDCA context. This patch does not add new functionality, it only modified how the information is retrieved. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Péter Ujfalusi Signed-off-by: Bard Liao Link: https://patch.msgid.link/20241016102333.294448-3-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- include/linux/soundwire/sdw.h | 6 +++--- include/linux/soundwire/sdw_amd.h | 7 ++----- include/linux/soundwire/sdw_intel.h | 7 ++----- 3 files changed, 7 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 5e0dd47a0412..283c8bfdde49 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -488,9 +488,9 @@ struct sdw_slave_id { __u8 sdw_version:4; }; -struct sdw_extended_slave_id { - int link_id; - struct sdw_slave_id id; +struct sdw_peripherals { + int num_peripherals; + struct sdw_slave *array[]; }; /* diff --git a/include/linux/soundwire/sdw_amd.h b/include/linux/soundwire/sdw_amd.h index 28a4eb77717f..585b4c58a8a6 100644 --- a/include/linux/soundwire/sdw_amd.h +++ b/include/linux/soundwire/sdw_amd.h @@ -115,19 +115,16 @@ struct sdw_amd_acpi_info { * struct sdw_amd_ctx - context allocated by the controller driver probe * * @count: link count - * @num_slaves: total number of devices exposed across all enabled links * @link_mask: bit-wise mask listing SoundWire links reported by the * Controller - * @ids: array of slave_id, representing Slaves exposed across all enabled - * links * @pdev: platform device structure + * @peripherals: array representing Peripherals exposed across all enabled links */ struct sdw_amd_ctx { int count; - int num_slaves; u32 link_mask; - struct sdw_extended_slave_id *ids; struct platform_device *pdev[AMD_SDW_MAX_MANAGER_COUNT]; + struct sdw_peripherals *peripherals; }; /** diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index fae345987b8c..491ddd27270f 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -287,31 +287,28 @@ struct hdac_bus; * hardware capabilities after all power dependencies are settled. * @link_mask: bit-wise mask listing SoundWire links reported by the * Controller - * @num_slaves: total number of devices exposed across all enabled links * @handle: ACPI parent handle * @ldev: information for each link (controller-specific and kept * opaque here) - * @ids: array of slave_id, representing Slaves exposed across all enabled - * links * @link_list: list to handle interrupts across all links * @shim_lock: mutex to handle concurrent rmw access to shared SHIM registers. * @shim_mask: flags to track initialization of SHIM shared registers * @shim_base: sdw shim base. * @alh_base: sdw alh base. + * @peripherals: array representing Peripherals exposed across all enabled links */ struct sdw_intel_ctx { int count; void __iomem *mmio_base; u32 link_mask; - int num_slaves; acpi_handle handle; struct sdw_intel_link_dev **ldev; - struct sdw_extended_slave_id *ids; struct list_head link_list; struct mutex shim_lock; /* lock for access to shared SHIM registers */ u32 shim_mask; u32 shim_base; u32 alh_base; + struct sdw_peripherals *peripherals; }; /** -- cgit v1.2.3 From 3a513da1ae33972e59efeef7908061f1f24af480 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Wed, 16 Oct 2024 18:23:25 +0800 Subject: ASoC: SDCA: add initial module MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add new module for SDCA (SoundWire Device Class for Audio) support. For now just add a parser to identify the SDCA revision and the function mask. Note that the SDCA definitions and related MIPI DisCo properties are defined only for ACPI platforms and extracted with _DSD helpers. There is currently no support for Device Tree in the specification, the 'depends on ACPI' reflects this design limitation. This might change in a future revision of the specification but for SDCA 1.0 ACPI is the only supported type of platform firmware. The SDCA library is defined with static inline fallbacks, which will allow for unconditional addition of SDCA support in common parts of the code. The design follows a four-step process: 1) Basic information related to Functions is extracted from MIPI DisCo tables and stored in the 'struct sdw_slave'. Devm_ based memory allocation is not allowed at this point prior to a driver probe, so we only store the function node, address and type. 2) When a codec driver probes, it will register subdevices for each Function identified in phase 1) 3) a driver will probe for each subdevice and addition parsing/memory allocation takes place at this level. devm_ based allocation is highly encouraged to make error handling manageable. 4) Before the peripheral device becomes physically attached, register access is not permitted and the regmaps are cache-only. When peripheral device is enumerated, the bus level uses the 'update_status' notification; after optional device-level initialization, the codec driver will notify each of the subdevices so that they can start interacting with the hardware. Note that the context extracted in 1) should be arguably be handled completely in the codec driver probe. That would however make it difficult to use the ACPI information for machine quirks, and e.g. select different machine driver and topologies as done for the RT712_VB handling later in the series. To make the implementation of quirks simpler, this patchset extracts a minimal amount of context (interface revision and number/type of Functions) before the codec driver probe, and stores this context in the scope of the 'struct sdw_slave'. The SDCA library can also be used in a vendor-specific driver without creating subdevices, e.g. to retrieve the 'initialization-table' values to write platform-specific values as needed. For more technical details, the SDCA specification is available for public downloads at https://www.mipi.org/mipi-sdca-v1-0-download Signed-off-by: Pierre-Louis Bossart Reviewed-by: Péter Ujfalusi Signed-off-by: Bard Liao Link: https://patch.msgid.link/20241016102333.294448-4-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- include/linux/soundwire/sdw.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 283c8bfdde49..49d690f3d29a 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -10,6 +10,7 @@ #include #include #include +#include struct sdw_bus; struct sdw_slave; @@ -663,6 +664,7 @@ struct sdw_slave_ops { * @is_mockup_device: status flag used to squelch errors in the command/control * protocol for SoundWire mockup devices * @sdw_dev_lock: mutex used to protect callbacks/remove races + * @sdca_data: structure containing all device data for SDCA helpers */ struct sdw_slave { struct sdw_slave_id id; @@ -686,6 +688,7 @@ struct sdw_slave { bool first_interrupt_done; bool is_mockup_device; struct mutex sdw_dev_lock; /* protect callbacks/remove races */ + struct sdca_device_data sdca_data; }; #define dev_to_sdw_dev(_dev) container_of(_dev, struct sdw_slave, dev) -- cgit v1.2.3 From 6a136805e3c1532d2414b298c024429943cfd482 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Lebrun?= Date: Mon, 7 Oct 2024 15:49:18 +0200 Subject: clk: divider: Introduce CLK_DIVIDER_EVEN_INTEGERS flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add CLK_DIVIDER_EVEN_INTEGERS flag to support divisor of 2, 4, 6, etc. The same divisor can be done using a table, which would be big and wasteful for a clock dividor of width 8 (256 entries). Require increasing flags size from u8 to u16 because CLK_DIVIDER_EVEN_INTEGERS is the eighth flag. u16 is used inside struct clk_divider; `unsigned long` is used for function arguments. Signed-off-by: Théo Lebrun Link: https://lore.kernel.org/r/20241007-mbly-clk-v5-3-e9d8994269cb@bootlin.com Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 7e43caabb54b..dbe793964c24 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -689,13 +689,15 @@ struct clk_div_table { * CLK_DIVIDER_BIG_ENDIAN - By default little endian register accesses are used * for the divider register. Setting this flag makes the register accesses * big endian. + * CLK_DIVIDER_EVEN_INTEGERS - clock divisor is 2, 4, 6, 8, 10, etc. + * Formula is 2 * (value read from hardware + 1). */ struct clk_divider { struct clk_hw hw; void __iomem *reg; u8 shift; u8 width; - u8 flags; + u16 flags; const struct clk_div_table *table; spinlock_t *lock; }; @@ -711,6 +713,7 @@ struct clk_divider { #define CLK_DIVIDER_READ_ONLY BIT(5) #define CLK_DIVIDER_MAX_AT_ZERO BIT(6) #define CLK_DIVIDER_BIG_ENDIAN BIT(7) +#define CLK_DIVIDER_EVEN_INTEGERS BIT(8) extern const struct clk_ops clk_divider_ops; extern const struct clk_ops clk_divider_ro_ops; @@ -740,19 +743,21 @@ struct clk_hw *__clk_hw_register_divider(struct device *dev, struct device_node *np, const char *name, const char *parent_name, const struct clk_hw *parent_hw, const struct clk_parent_data *parent_data, unsigned long flags, - void __iomem *reg, u8 shift, u8 width, u8 clk_divider_flags, + void __iomem *reg, u8 shift, u8 width, + unsigned long clk_divider_flags, const struct clk_div_table *table, spinlock_t *lock); struct clk_hw *__devm_clk_hw_register_divider(struct device *dev, struct device_node *np, const char *name, const char *parent_name, const struct clk_hw *parent_hw, const struct clk_parent_data *parent_data, unsigned long flags, - void __iomem *reg, u8 shift, u8 width, u8 clk_divider_flags, + void __iomem *reg, u8 shift, u8 width, + unsigned long clk_divider_flags, const struct clk_div_table *table, spinlock_t *lock); struct clk *clk_register_divider_table(struct device *dev, const char *name, const char *parent_name, unsigned long flags, void __iomem *reg, u8 shift, u8 width, - u8 clk_divider_flags, const struct clk_div_table *table, - spinlock_t *lock); + unsigned long clk_divider_flags, + const struct clk_div_table *table, spinlock_t *lock); /** * clk_register_divider - register a divider clock with the clock framework * @dev: device registering this clock -- cgit v1.2.3 From 44fcc479a574a4055fb6aed1f786d39999466383 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 17 Oct 2024 06:37:33 +0200 Subject: power: supply: hwmon: move interface to private header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The interface of power_supply_hwmon.c is only meant to be used by the psy core. Remove it from the public header file and use the private one instead. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20241017-power-supply-cleanups-v2-1-cb0f5deab088@weissschuh.net Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index c1d6cb7f4c40..b98106e1a90f 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -937,19 +937,6 @@ static inline bool power_supply_is_watt_property(enum power_supply_property psp) return false; } -#ifdef CONFIG_POWER_SUPPLY_HWMON -int power_supply_add_hwmon_sysfs(struct power_supply *psy); -void power_supply_remove_hwmon_sysfs(struct power_supply *psy); -#else -static inline int power_supply_add_hwmon_sysfs(struct power_supply *psy) -{ - return 0; -} - -static inline -void power_supply_remove_hwmon_sysfs(struct power_supply *psy) {} -#endif - #ifdef CONFIG_SYSFS ssize_t power_supply_charge_behaviour_show(struct device *dev, unsigned int available_behaviours, -- cgit v1.2.3 From 0784181b44af831a3fa52e1e5ff77c388d699dba Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 26 Sep 2024 16:17:37 +0100 Subject: lockdep: Add lockdep_cleanup_dead_cpu() Add a function to check that an offline CPU has left the tracing infrastructure in a sane state. Commit 9bb69ba4c177 ("ACPI: processor_idle: use raw_safe_halt() in acpi_idle_play_dead()") fixed an issue where the acpi_idle_play_dead() function called safe_halt() instead of raw_safe_halt(), which had the side-effect of setting the hardirqs_enabled flag for the offline CPU. On x86 this triggered warnings from lockdep_assert_irqs_disabled() when the CPU was brought back online again later. These warnings were too early for the exception to be handled correctly, leading to a triple-fault. Add lockdep_cleanup_dead_cpu() to check for this kind of failure mode, print the events leading up to it, and correct it so that the CPU can come online again correctly. Re-introducing the original bug now merely results in this warning instead: [ 61.556652] smpboot: CPU 1 is now offline [ 61.556769] CPU 1 left hardirqs enabled! [ 61.556915] irq event stamp: 128149 [ 61.556965] hardirqs last enabled at (128149): [] acpi_idle_play_dead+0x46/0x70 [ 61.557055] hardirqs last disabled at (128148): [] do_idle+0x90/0xe0 [ 61.557117] softirqs last enabled at (128078): [] __do_softirq+0x31c/0x423 [ 61.557199] softirqs last disabled at (128065): [] __irq_exit_rcu+0x91/0x100 [boqun: Capitalize the title and reword the message a bit] Signed-off-by: David Woodhouse Reviewed-by: Thomas Gleixner Signed-off-by: Boqun Feng Link: https://lore.kernel.org/r/f7bd2b3b999051bb3ef4be34526a9262008285f5.camel@infradead.org --- include/linux/irqflags.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 3f003d5fde53..57b074e0cfbb 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -18,6 +18,8 @@ #include #include +struct task_struct; + /* Currently lockdep_softirqs_on/off is used only by lockdep */ #ifdef CONFIG_PROVE_LOCKING extern void lockdep_softirqs_on(unsigned long ip); @@ -25,12 +27,16 @@ extern void lockdep_hardirqs_on_prepare(void); extern void lockdep_hardirqs_on(unsigned long ip); extern void lockdep_hardirqs_off(unsigned long ip); + extern void lockdep_cleanup_dead_cpu(unsigned int cpu, + struct task_struct *idle); #else static inline void lockdep_softirqs_on(unsigned long ip) { } static inline void lockdep_softirqs_off(unsigned long ip) { } static inline void lockdep_hardirqs_on_prepare(void) { } static inline void lockdep_hardirqs_on(unsigned long ip) { } static inline void lockdep_hardirqs_off(unsigned long ip) { } + static inline void lockdep_cleanup_dead_cpu(unsigned int cpu, + struct task_struct *idle) {} #endif #ifdef CONFIG_TRACE_IRQFLAGS -- cgit v1.2.3 From d7fe143cb115076fed0126ad8cf5ba6c3e575e43 Mon Sep 17 00:00:00 2001 From: Ahmed Ehab Date: Sun, 25 Aug 2024 01:10:30 +0300 Subject: locking/lockdep: Avoid creating new name string literals in lockdep_set_subclass() Syzbot reports a problem that a warning will be triggered while searching a lock class in look_up_lock_class(). The cause of the issue is that a new name is created and used by lockdep_set_subclass() instead of using the existing one. This results in a lock instance has a different name pointer than previous registered one stored in lock class, and WARN_ONCE() is triggered because of that in look_up_lock_class(). To fix this, change lockdep_set_subclass() to use the existing name instead of a new one. Hence, no new name will be created by lockdep_set_subclass(). Hence, the warning is avoided. [boqun: Reword the commit log to state the correct issue] Reported-by: Fixes: de8f5e4f2dc1f ("lockdep: Introduce wait-type checks") Cc: stable@vger.kernel.org Signed-off-by: Ahmed Ehab Signed-off-by: Boqun Feng Link: https://lore.kernel.org/lkml/20240824221031.7751-1-bottaawesome633@gmail.com/ --- include/linux/lockdep.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 217f7abf2cbf..67964dc4db95 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -173,7 +173,7 @@ static inline void lockdep_init_map(struct lockdep_map *lock, const char *name, (lock)->dep_map.lock_type) #define lockdep_set_subclass(lock, sub) \ - lockdep_init_map_type(&(lock)->dep_map, #lock, (lock)->dep_map.key, sub,\ + lockdep_init_map_type(&(lock)->dep_map, (lock)->dep_map.name, (lock)->dep_map.key, sub,\ (lock)->dep_map.wait_type_inner, \ (lock)->dep_map.wait_type_outer, \ (lock)->dep_map.lock_type) -- cgit v1.2.3 From 6eaa83ec229b1e99130456c4f6658430d509c76c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 17 Oct 2024 17:11:08 +0300 Subject: PCI: Remove unused PCI_SUBTRACTIVE_DECODE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 2fe2abf896c1 ("PCI: augment bus resource table with a list") added PCI_SUBTRACTIVE_DECODE which is put into the struct pci_bus_resource flags field but is never read. There seems to never have been users for it. Remove both PCI_SUBTRACTIVE_DECODE and the flags field from the struct pci_bus_resource. Link: https://lore.kernel.org/r/20241017141111.44612-1-ilpo.jarvinen@linux.intel.com Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron --- include/linux/pci.h | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 573b4c4c2be6..6a9cf80d0d4b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -633,18 +633,9 @@ int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge); * Use pci_bus_for_each_resource() to iterate through all the resources. */ -/* - * PCI_SUBTRACTIVE_DECODE means the bridge forwards the window implicitly - * and there's no way to program the bridge with the details of the window. - * This does not apply to ACPI _CRS windows, even with the _DEC subtractive- - * decode bit set, because they are explicit and can be programmed with _SRS. - */ -#define PCI_SUBTRACTIVE_DECODE 0x1 - struct pci_bus_resource { struct list_head list; struct resource *res; - unsigned int flags; }; #define PCI_REGION_FLAG_MASK 0x0fU /* These bits of resource flags tell us the PCI region flags */ @@ -1498,8 +1489,7 @@ void pci_add_resource(struct list_head *resources, struct resource *res); void pci_add_resource_offset(struct list_head *resources, struct resource *res, resource_size_t offset); void pci_free_resource_list(struct list_head *resources); -void pci_bus_add_resource(struct pci_bus *bus, struct resource *res, - unsigned int flags); +void pci_bus_add_resource(struct pci_bus *bus, struct resource *res); struct resource *pci_bus_resource_n(const struct pci_bus *bus, int n); void pci_bus_remove_resources(struct pci_bus *bus); void pci_bus_remove_resource(struct pci_bus *bus, struct resource *res); -- cgit v1.2.3 From 469c9cb941480718db52876bbdb95a7a79b34889 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 17 Oct 2024 17:11:09 +0300 Subject: PCI: Move struct pci_bus_resource into bus.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The struct pci_bus_resource is only used in bus.c, so move it there. Link: https://lore.kernel.org/r/20241017141111.44612-2-ilpo.jarvinen@linux.intel.com Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron --- include/linux/pci.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 6a9cf80d0d4b..5a9d849b28ef 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -626,18 +626,6 @@ void pci_set_host_bridge_release(struct pci_host_bridge *bridge, int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge); -/* - * The first PCI_BRIDGE_RESOURCE_NUM PCI bus resources (those that correspond - * to P2P or CardBus bridge windows) go in a table. Additional ones (for - * buses below host bridges or subtractive decode bridges) go in the list. - * Use pci_bus_for_each_resource() to iterate through all the resources. - */ - -struct pci_bus_resource { - struct list_head list; - struct resource *res; -}; - #define PCI_REGION_FLAG_MASK 0x0fU /* These bits of resource flags tell us the PCI region flags */ struct pci_bus { -- cgit v1.2.3 From 08ef26ea9ab315b895d57f8fbad41e02ff345bb9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 7 Oct 2024 16:23:58 +0200 Subject: fs: add file_ref As atomic_inc_not_zero() is implemented with a try_cmpxchg() loop it has O(N^2) behaviour under contention with N concurrent operations and it is in a hot path in __fget_files_rcu(). The rcuref infrastructures remedies this problem by using an unconditional increment relying on safe- and dead zones to make this work and requiring rcu protection for the data structure in question. This not just scales better it also introduces overflow protection. However, in contrast to generic rcuref, files require a memory barrier and thus cannot rely on *_relaxed() atomic operations and also require to be built on atomic_long_t as having massive amounts of reference isn't unheard of even if it is just an attack. As suggested by Linus, add a file specific variant instead of making this a generic library. Files are SLAB_TYPESAFE_BY_RCU and thus don't have "regular" rcu protection. In short, freeing of files isn't delayed until a grace period has elapsed. Instead, they are freed immediately and thus can be reused (multiple times) within the same grace period. So when picking a file from the file descriptor table via its file descriptor number it is thus possible to see an elevated reference count on file->f_count even though the file has already been recycled possibly multiple times by another task. To guard against this the vfs will pick the file from the file descriptor table twice. Once before the refcount increment and once after to compare the pointers (grossly simplified). If they match then the file is still valid. If not the caller needs to fput() it. The unconditional increment makes the following race possible as illustrated by rcuref: > Deconstruction race > =================== > > The release operation must be protected by prohibiting a grace period in > order to prevent a possible use after free: > > T1 T2 > put() get() > // ref->refcnt = ONEREF > if (!atomic_add_negative(-1, &ref->refcnt)) > return false; <- Not taken > > // ref->refcnt == NOREF > --> preemption > // Elevates ref->refcnt to ONEREF > if (!atomic_add_negative(1, &ref->refcnt)) > return true; <- taken > > if (put(&p->ref)) { <-- Succeeds > remove_pointer(p); > kfree_rcu(p, rcu); > } > > RCU grace period ends, object is freed > > atomic_cmpxchg(&ref->refcnt, NOREF, DEAD); <- UAF > > [...] it prevents the grace period which keeps the object alive until > all put() operations complete. Having files by SLAB_TYPESAFE_BY_RCU shouldn't cause any problems for this deconstruction race. Afaict, the only interesting case would be someone freeing the file and someone immediately recycling it within the same grace period and reinitializing file->f_count to ONEREF while a concurrent fput() is doing atomic_cmpxchg(&ref->refcnt, NOREF, DEAD) as in the race above. But this is safe from SLAB_TYPESAFE_BY_RCU's perspective and it should be safe from rcuref's perspective. T1 T2 T3 fput() fget() // f_count->refcnt = ONEREF if (!atomic_add_negative(-1, &f_count->refcnt)) return false; <- Not taken // f_count->refcnt == NOREF --> preemption // Elevates f_count->refcnt to ONEREF if (!atomic_add_negative(1, &f_count->refcnt)) return true; <- taken if (put(&f_count)) { <-- Succeeds remove_pointer(p); /* * Cache is SLAB_TYPESAFE_BY_RCU * so this is freed without a grace period. */ kmem_cache_free(p); } kmem_cache_alloc() init_file() { // Sets f_count->refcnt to ONEREF rcuref_long_init(&f->f_count, 1); } Object has been reused within the same grace period via kmem_cache_alloc()'s SLAB_TYPESAFE_BY_RCU. /* * With SLAB_TYPESAFE_BY_RCU this would be a safe UAF access and * it would work correctly because the atomic_cmpxchg() * will fail because the refcount has been reset to ONEREF by T3. */ atomic_cmpxchg(&ref->refcnt, NOREF, DEAD); <- UAF However, there are other cases to consider: (1) Benign race due to multiple atomic_long_read() CPU1 CPU2 file_ref_put() // last reference // => count goes negative/FILE_REF_NOREF atomic_long_add_negative_release(-1, &ref->refcnt) -> __file_ref_put() file_ref_get() // goes back from negative/FILE_REF_NOREF to 0 // and file_ref_get() succeeds atomic_long_add_negative(1, &ref->refcnt) // This is immediately followed by file_ref_put() // managing to set FILE_REF_DEAD file_ref_put() // __file_ref_put() continues and sees // cnt > FILE_REF_RELEASED // and splats with // "imbalanced put on file reference count" cnt = atomic_long_read(&ref->refcnt); The race however is benign and the problem is the atomic_long_read(). Instead of performing a separate read this uses atomic_long_dec_return() and pass the value to __file_ref_put(). Thanks to Linus for pointing out that braino. (2) SLAB_TYPESAFE_BY_RCU may cause recycled files to be marked dead When a file is recycled the following race exists: CPU1 CPU2 // @file is already dead and thus // cnt >= FILE_REF_RELEASED. file_ref_get(file) atomic_long_add_negative(1, &ref->refcnt) // We thus call into __file_ref_get() -> __file_ref_get() // which sees cnt >= FILE_REF_RELEASED cnt = atomic_long_read(&ref->refcnt); // In the meantime @file gets freed kmem_cache_free() // and is immediately recycled file = kmem_cache_zalloc() // and the reference count is reinitialized // and the file alive again in someone // else's file descriptor table file_ref_init(&ref->refcnt, 1); // the __file_ref_get() slowpath now continues // and as it saw earlier that cnt >= FILE_REF_RELEASED // it wants to ensure that we're staying in the middle // of the deadzone and unconditionally sets // FILE_REF_DEAD. // This marks @file dead for CPU2... atomic_long_set(&ref->refcnt, FILE_REF_DEAD); // Caller issues a close() system call to close @file close(fd) file = file_close_fd_locked() filp_flush() // The caller sees that cnt >= FILE_REF_RELEASED // and warns the first time... CHECK_DATA_CORRUPTION(file_count(file) == 0) // and then splats a second time because // __file_ref_put() sees cnt >= FILE_REF_RELEASED file_ref_put(&ref->refcnt); -> __file_ref_put() My initial inclination was to replace the unconditional atomic_long_set() with an atomic_long_try_cmpxchg() but Linus pointed out that: > I think we should just make file_ref_get() do a simple > > return !atomic_long_add_negative(1, &ref->refcnt)); > > and nothing else. Yes, multiple CPU's can race, and you can increment > more than once, but the gap - even on 32-bit - between DEAD and > becoming close to REF_RELEASED is so big that we simply don't care. > That's the point of having a gap. I've been testing this with will-it-scale using fstat() on a machine that Jens gave me access (thank you very much!): processor : 511 vendor_id : AuthenticAMD cpu family : 25 model : 160 model name : AMD EPYC 9754 128-Core Processor and I consistently get a 3-5% improvement on 256+ threads. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202410151043.5d224a27-oliver.sang@intel.com Closes: https://lore.kernel.org/all/202410151611.f4cd71f2-oliver.sang@intel.com Link: https://lore.kernel.org/r/20241007-brauner-file-rcuref-v2-2-387e24dc9163@kernel.org Signed-off-by: Christian Brauner --- include/linux/file_ref.h | 177 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 include/linux/file_ref.h (limited to 'include/linux') diff --git a/include/linux/file_ref.h b/include/linux/file_ref.h new file mode 100644 index 000000000000..9b3a8d9b17ab --- /dev/null +++ b/include/linux/file_ref.h @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _LINUX_FILE_REF_H +#define _LINUX_FILE_REF_H + +#include +#include +#include + +/* + * file_ref is a reference count implementation specifically for use by + * files. It takes inspiration from rcuref but differs in key aspects + * such as support for SLAB_TYPESAFE_BY_RCU type caches. + * + * FILE_REF_ONEREF FILE_REF_MAXREF + * 0x0000000000000000UL 0x7FFFFFFFFFFFFFFFUL + * <-------------------valid -------------------> + * + * FILE_REF_SATURATED + * 0x8000000000000000UL 0xA000000000000000UL 0xBFFFFFFFFFFFFFFFUL + * <-----------------------saturation zone----------------------> + * + * FILE_REF_RELEASED FILE_REF_DEAD + * 0xC000000000000000UL 0xE000000000000000UL + * <-------------------dead zone-------------------> + * + * FILE_REF_NOREF + * 0xFFFFFFFFFFFFFFFFUL + */ + +#ifdef CONFIG_64BIT +#define FILE_REF_ONEREF 0x0000000000000000UL +#define FILE_REF_MAXREF 0x7FFFFFFFFFFFFFFFUL +#define FILE_REF_SATURATED 0xA000000000000000UL +#define FILE_REF_RELEASED 0xC000000000000000UL +#define FILE_REF_DEAD 0xE000000000000000UL +#define FILE_REF_NOREF 0xFFFFFFFFFFFFFFFFUL +#else +#define FILE_REF_ONEREF 0x00000000U +#define FILE_REF_MAXREF 0x7FFFFFFFU +#define FILE_REF_SATURATED 0xA0000000U +#define FILE_REF_RELEASED 0xC0000000U +#define FILE_REF_DEAD 0xE0000000U +#define FILE_REF_NOREF 0xFFFFFFFFU +#endif + +typedef struct { +#ifdef CONFIG_64BIT + atomic64_t refcnt; +#else + atomic_t refcnt; +#endif +} file_ref_t; + +/** + * file_ref_init - Initialize a file reference count + * @ref: Pointer to the reference count + * @cnt: The initial reference count typically '1' + */ +static inline void file_ref_init(file_ref_t *ref, unsigned long cnt) +{ + atomic_long_set(&ref->refcnt, cnt - 1); +} + +bool __file_ref_put(file_ref_t *ref, unsigned long cnt); + +/** + * file_ref_get - Acquire one reference on a file + * @ref: Pointer to the reference count + * + * Similar to atomic_inc_not_zero() but saturates at FILE_REF_MAXREF. + * + * Provides full memory ordering. + * + * Return: False if the attempt to acquire a reference failed. This happens + * when the last reference has been put already. True if a reference + * was successfully acquired + */ +static __always_inline __must_check bool file_ref_get(file_ref_t *ref) +{ + /* + * Unconditionally increase the reference count with full + * ordering. The saturation and dead zones provide enough + * tolerance for this. + * + * If this indicates negative the file in question the fail can + * be freed and immediately reused due to SLAB_TYPSAFE_BY_RCU. + * Hence, unconditionally altering the file reference count to + * e.g., reset the file reference count back to the middle of + * the deadzone risk end up marking someone else's file as dead + * behind their back. + * + * It would be possible to do a careful: + * + * cnt = atomic_long_inc_return(); + * if (likely(cnt >= 0)) + * return true; + * + * and then something like: + * + * if (cnt >= FILE_REF_RELEASE) + * atomic_long_try_cmpxchg(&ref->refcnt, &cnt, FILE_REF_DEAD), + * + * to set the value back to the middle of the deadzone. But it's + * practically impossible to go from FILE_REF_DEAD to + * FILE_REF_ONEREF. It would need 2305843009213693952/2^61 + * file_ref_get()s to resurrect such a dead file. + */ + return !atomic_long_add_negative(1, &ref->refcnt); +} + +/** + * file_ref_inc - Acquire one reference on a file + * @ref: Pointer to the reference count + * + * Acquire an additional reference on a file. Warns if the caller didn't + * already hold a reference. + */ +static __always_inline void file_ref_inc(file_ref_t *ref) +{ + long prior = atomic_long_fetch_inc_relaxed(&ref->refcnt); + WARN_ONCE(prior < 0, "file_ref_inc() on a released file reference"); +} + +/** + * file_ref_put -- Release a file reference + * @ref: Pointer to the reference count + * + * Provides release memory ordering, such that prior loads and stores + * are done before, and provides an acquire ordering on success such + * that free() must come after. + * + * Return: True if this was the last reference with no future references + * possible. This signals the caller that it can safely release + * the object which is protected by the reference counter. + * False if there are still active references or the put() raced + * with a concurrent get()/put() pair. Caller is not allowed to + * release the protected object. + */ +static __always_inline __must_check bool file_ref_put(file_ref_t *ref) +{ + long cnt; + + /* + * While files are SLAB_TYPESAFE_BY_RCU and thus file_ref_put() + * calls don't risk UAFs when a file is recyclyed, it is still + * vulnerable to UAFs caused by freeing the whole slab page once + * it becomes unused. Prevent file_ref_put() from being + * preempted protects against this. + */ + guard(preempt)(); + /* + * Unconditionally decrease the reference count. The saturation + * and dead zones provide enough tolerance for this. If this + * fails then we need to handle the last reference drop and + * cases inside the saturation and dead zones. + */ + cnt = atomic_long_dec_return(&ref->refcnt); + if (cnt >= 0) + return false; + return __file_ref_put(ref, cnt); +} + +/** + * file_ref_read - Read the number of file references + * @ref: Pointer to the reference count + * + * Return: The number of held references (0 ... N) + */ +static inline unsigned long file_ref_read(file_ref_t *ref) +{ + unsigned long c = atomic_long_read(&ref->refcnt); + + /* Return 0 if within the DEAD zone. */ + return c >= FILE_REF_RELEASED ? 0 : c + 1; +} + +#endif -- cgit v1.2.3 From 9a8dbdadae509e5717ff6e5aa572ca0974d2101d Mon Sep 17 00:00:00 2001 From: John Garry Date: Sat, 19 Oct 2024 12:51:06 +0000 Subject: block/fs: Pass an iocb to generic_atomic_write_valid() Darrick and Hannes both thought it better that generic_atomic_write_valid() should be passed a struct iocb, and not just the member of that struct which is referenced; see [0] and [1]. I think that makes a more generic and clean API, so make that change. [0] https://lore.kernel.org/linux-block/680ce641-729b-4150-b875-531a98657682@suse.de/ [1] https://lore.kernel.org/linux-xfs/20240620212401.GA3058325@frogsfrogsfrogs/ Fixes: c34fc6f26ab8 ("fs: Initial atomic write support") Suggested-by: Darrick J. Wong Suggested-by: Hannes Reinecke Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: John Garry Link: https://lore.kernel.org/r/20241019125113.369994-2-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e3c603d01337..fbfa032d1d90 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3721,6 +3721,6 @@ static inline bool vfs_empty_path(int dfd, const char __user *path) return !c; } -bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos); +bool generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter); #endif /* _LINUX_FS_H */ -- cgit v1.2.3 From c3be7ebbbce5201e151f17e28a6c807602f369c9 Mon Sep 17 00:00:00 2001 From: John Garry Date: Sat, 19 Oct 2024 12:51:07 +0000 Subject: fs/block: Check for IOCB_DIRECT in generic_atomic_write_valid() Currently FMODE_CAN_ATOMIC_WRITE is set if the bdev can atomic write and the file is open for direct IO. This does not work if the file is not opened for direct IO, yet fcntl(O_DIRECT) is used on the fd later. Change to check for direct IO on a per-IO basis in generic_atomic_write_valid(). Since we want to report -EOPNOTSUPP for non-direct IO for an atomic write, change to return an error code. Relocate the block fops atomic write checks to the common write path, as to catch non-direct IO. Fixes: c34fc6f26ab8 ("fs: Initial atomic write support") Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: John Garry Link: https://lore.kernel.org/r/20241019125113.369994-3-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index fbfa032d1d90..ba47fb283730 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3721,6 +3721,6 @@ static inline bool vfs_empty_path(int dfd, const char __user *path) return !c; } -bool generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter); +int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter); #endif /* _LINUX_FS_H */ -- cgit v1.2.3 From 1eadb157947163ca72ba8963b915fdc099ce6cca Mon Sep 17 00:00:00 2001 From: John Garry Date: Sat, 19 Oct 2024 12:51:08 +0000 Subject: block: Add bdev atomic write limits helpers Add helpers to get atomic write limits for a bdev, so that we don't access request_queue helpers outside the block layer. We check if the bdev can actually atomic write in these helpers, so we can avoid users missing using this check. Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: John Garry Link: https://lore.kernel.org/r/20241019125113.369994-4-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 50c3b959da28..c2cc3c146d74 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1674,6 +1674,22 @@ static inline bool bdev_can_atomic_write(struct block_device *bdev) return true; } +static inline unsigned int +bdev_atomic_write_unit_min_bytes(struct block_device *bdev) +{ + if (!bdev_can_atomic_write(bdev)) + return 0; + return queue_atomic_write_unit_min_bytes(bdev_get_queue(bdev)); +} + +static inline unsigned int +bdev_atomic_write_unit_max_bytes(struct block_device *bdev) +{ + if (!bdev_can_atomic_write(bdev)) + return 0; + return queue_atomic_write_unit_max_bytes(bdev_get_queue(bdev)); +} + #define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } #endif /* _LINUX_BLKDEV_H */ -- cgit v1.2.3 From e896474fe4851ffc4dd860c92daa906783090346 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 7 Oct 2024 23:08:34 -0400 Subject: getname_maybe_null() - the third variant of pathname copy-in Semantics used by statx(2) (and later *xattrat(2)): without AT_EMPTY_PATH it's standard getname() (i.e. ERR_PTR(-ENOENT) on empty string, ERR_PTR(-EFAULT) on NULL), with AT_EMPTY_PATH both empty string and NULL are accepted. Calling conventions: getname_maybe_null(user_pointer, flags) returns * pointer to struct filename when non-empty string had been successfully read * ERR_PTR(...) on error * NULL if an empty string or NULL pointer had been given with AT_EMPTY_PATH in the flags argument. It tries to avoid allocation in the last case; it's not always able to do so, in which case the temporary struct filename instance is freed and NULL returned anyway. Fast path is inlined. Signed-off-by: Al Viro --- include/linux/fs.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e3c603d01337..403258ac2ea2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2766,6 +2766,16 @@ extern struct filename *getname_flags(const char __user *, int); extern struct filename *getname_uflags(const char __user *, int); extern struct filename *getname(const char __user *); extern struct filename *getname_kernel(const char *); +extern struct filename *__getname_maybe_null(const char __user *); +static inline struct filename *getname_maybe_null(const char __user *name, int flags) +{ + if (!(flags & AT_EMPTY_PATH)) + return getname(name); + + if (!name) + return NULL; + return __getname_maybe_null(name); +} extern void putname(struct filename *name); extern int finish_open(struct file *file, struct dentry *dentry, -- cgit v1.2.3 From af264e5989055ac33f413c4c80874345cda0cc97 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 18 Oct 2024 19:05:57 +0200 Subject: mtd: spinand: Constify struct nand_ecc_engine_ops 'struct nand_ecc_engine_ops' are not modified in these drivers. Constifying this structure moves some data to a read-only section, so increases overall security, especially when the structure holds some function pointers. Update the prototype of mxic_ecc_get_pipelined_ops() accordingly. On a x86_64, with allmodconfig, as an example: Before: ====== text data bss dec hex filename 16709 1374 16 18099 46b3 drivers/mtd/nand/ecc-mxic.o After: ===== text data bss dec hex filename 16789 1294 16 18099 46b3 drivers/mtd/nand/ecc-mxic.o Signed-off-by: Christophe JAILLET Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/72597e9de2320a4109be2112e696399592edacd4.1729271136.git.christophe.jaillet@wanadoo.fr --- include/linux/mtd/nand-ecc-mxic.h | 4 ++-- include/linux/mtd/nand.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/nand-ecc-mxic.h b/include/linux/mtd/nand-ecc-mxic.h index b125926e458c..0da4b2999576 100644 --- a/include/linux/mtd/nand-ecc-mxic.h +++ b/include/linux/mtd/nand-ecc-mxic.h @@ -16,7 +16,7 @@ struct mxic_ecc_engine; #if IS_ENABLED(CONFIG_MTD_NAND_ECC_MXIC) && IS_REACHABLE(CONFIG_MTD_NAND_CORE) -struct nand_ecc_engine_ops *mxic_ecc_get_pipelined_ops(void); +const struct nand_ecc_engine_ops *mxic_ecc_get_pipelined_ops(void); struct nand_ecc_engine *mxic_ecc_get_pipelined_engine(struct platform_device *spi_pdev); void mxic_ecc_put_pipelined_engine(struct nand_ecc_engine *eng); int mxic_ecc_process_data_pipelined(struct nand_ecc_engine *eng, @@ -24,7 +24,7 @@ int mxic_ecc_process_data_pipelined(struct nand_ecc_engine *eng, #else /* !CONFIG_MTD_NAND_ECC_MXIC */ -static inline struct nand_ecc_engine_ops *mxic_ecc_get_pipelined_ops(void) +static inline const struct nand_ecc_engine_ops *mxic_ecc_get_pipelined_ops(void) { return NULL; } diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 1e4208040956..0e2f228e8b4a 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -293,7 +293,7 @@ enum nand_ecc_engine_integration { struct nand_ecc_engine { struct device *dev; struct list_head node; - struct nand_ecc_engine_ops *ops; + const struct nand_ecc_engine_ops *ops; enum nand_ecc_engine_integration integration; void *priv; }; -- cgit v1.2.3 From 192514ae05687a60ac230569b2a215fb2d3b8d90 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Mon, 10 Jun 2024 10:57:32 +0200 Subject: soc: mediatek: Add MediaTek DVFS Resource Collector (DVFSRC) driver The Dynamic Voltage and Frequency Scaling Resource Collector (DVFSRC) is a Hardware module used to collect all the requests from both software and the various remote processors embedded into the SoC and decide about a minimum operating voltage and a minimum DRAM frequency to fulfill those requests in an effort to provide the best achievable performance per watt. This hardware IP is capable of transparently performing direct register R/W on all of the DVFSRC-controlled regulators and SoC bandwidth knobs. This driver includes support for MT8183, MT8192 and MT8195. Co-Developed-by: Dawei Chien [Angelo: Partial refactoring and cleanups] Reviewed-by: Georgi Djakov Link: https://lore.kernel.org/r/20240610085735.147134-5-angelogioacchino.delregno@collabora.com Signed-off-by: AngeloGioacchino Del Regno --- include/linux/soc/mediatek/dvfsrc.h | 36 ++++++++++++++++++++++++++++++++ include/linux/soc/mediatek/mtk_sip_svc.h | 3 +++ 2 files changed, 39 insertions(+) create mode 100644 include/linux/soc/mediatek/dvfsrc.h (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/dvfsrc.h b/include/linux/soc/mediatek/dvfsrc.h new file mode 100644 index 000000000000..1498b3ed396b --- /dev/null +++ b/include/linux/soc/mediatek/dvfsrc.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (c) 2021 MediaTek Inc. + * Copyright (c) 2024 Collabora Ltd. + * AngeloGioacchino Del Regno + */ + +#ifndef __MEDIATEK_DVFSRC_H +#define __MEDIATEK_DVFSRC_H + +enum mtk_dvfsrc_cmd { + MTK_DVFSRC_CMD_BW, + MTK_DVFSRC_CMD_HRT_BW, + MTK_DVFSRC_CMD_PEAK_BW, + MTK_DVFSRC_CMD_OPP, + MTK_DVFSRC_CMD_VCORE_LEVEL, + MTK_DVFSRC_CMD_VSCP_LEVEL, + MTK_DVFSRC_CMD_MAX, +}; + +#if IS_ENABLED(CONFIG_MTK_DVFSRC) + +int mtk_dvfsrc_send_request(const struct device *dev, u32 cmd, u64 data); +int mtk_dvfsrc_query_info(const struct device *dev, u32 cmd, int *data); + +#else + +static inline int mtk_dvfsrc_send_request(const struct device *dev, u32 cmd, u64 data) +{ return -ENODEV; } + +static inline int mtk_dvfsrc_query_info(const struct device *dev, u32 cmd, int *data) +{ return -ENODEV; } + +#endif /* CONFIG_MTK_DVFSRC */ + +#endif diff --git a/include/linux/soc/mediatek/mtk_sip_svc.h b/include/linux/soc/mediatek/mtk_sip_svc.h index 0761128b4354..abe24a73ee19 100644 --- a/include/linux/soc/mediatek/mtk_sip_svc.h +++ b/include/linux/soc/mediatek/mtk_sip_svc.h @@ -22,6 +22,9 @@ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, MTK_SIP_SMC_CONVENTION, \ ARM_SMCCC_OWNER_SIP, fn_id) +/* DVFSRC SMC calls */ +#define MTK_SIP_DVFSRC_VCOREFS_CONTROL MTK_SIP_SMC_CMD(0x506) + /* IOMMU related SMC call */ #define MTK_SIP_KERNEL_IOMMU_CONTROL MTK_SIP_SMC_CMD(0x514) -- cgit v1.2.3 From 424a55a4a9087887fcfcee561648df497701a4a2 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Thu, 10 Oct 2024 07:40:34 +1100 Subject: uaccess: add copy_struct_to_user helper This is based on copy_struct_from_user(), but there is one additional case to consider when creating a syscall that returns an extensible-struct to userspace -- how should data in the struct that cannot fit into the userspace struct be handled (ksize > usize)? There are three possibilies: 1. The interface is like sched_getattr(2), where new information will be silently not provided to userspace. This is probably what most interfaces will want to do, as it provides the most possible backwards-compatibility. 2. The interface is like lsm_list_modules(2), where you want to return an error like -EMSGSIZE if not providing information could result in the userspace program making a serious mistake (such as one that could lead to a security problem) or if you want to provide some flag to userspace so they know that they are missing some information. 3. The interface is like statx(2), where there some kind of a request mask that indicates what data userspace would like. One could imagine that statx2(2) (using extensible structs) would want to return -EMSGSIZE if the user explicitly requested a field that their structure is too small to fit, but not return an error if the field was not explicitly requested. This is kind of a mix between (1) and (2) based on the requested mask. The copy_struct_to_user() helper includes a an extra argument that is used to return a boolean flag indicating whether there was a non-zero byte in the trailing bytes that were not copied to userspace. This can be used in the following ways to handle all three cases, respectively: 1. Just pass NULL, as you don't care about this case. 2. Return an error (say -EMSGSIZE) if the argument was set to true by copy_struct_to_user(). 3. If the argument was set to true by copy_struct_to_user(), check if there is a flag that implies a field larger than usize. This is the only case where callers of copy_struct_to_user() should check usize themselves. This will probably require scanning an array that specifies what flags were added for each version of the flags struct and returning an error if the request mask matches any of the flags that were added in versions of the struct that are larger than usize. At the moment we don't have any users of (3), so this patch doesn't include any helpers to make the necessary scanning easier, but it should be fairly easy to add some if necessary. Signed-off-by: Aleksa Sarai Link: https://lore.kernel.org/r/20241010-extensible-structs-check_fields-v3-1-d2833dfe6edd@cyphar.com Signed-off-by: Christian Brauner --- include/linux/uaccess.h | 97 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 39c7cf82b0c2..1e7c68b4c262 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -394,6 +394,103 @@ copy_struct_from_user(void *dst, size_t ksize, const void __user *src, return 0; } +/** + * copy_struct_to_user: copy a struct to userspace + * @dst: Destination address, in userspace. This buffer must be @ksize + * bytes long. + * @usize: (Alleged) size of @dst struct. + * @src: Source address, in kernel space. + * @ksize: Size of @src struct. + * @ignored_trailing: Set to %true if there was a non-zero byte in @src that + * userspace cannot see because they are using an smaller struct. + * + * Copies a struct from kernel space to userspace, in a way that guarantees + * backwards-compatibility for struct syscall arguments (as long as future + * struct extensions are made such that all new fields are *appended* to the + * old struct, and zeroed-out new fields have the same meaning as the old + * struct). + * + * Some syscalls may wish to make sure that userspace knows about everything in + * the struct, and if there is a non-zero value that userspce doesn't know + * about, they want to return an error (such as -EMSGSIZE) or have some other + * fallback (such as adding a "you're missing some information" flag). If + * @ignored_trailing is non-%NULL, it will be set to %true if there was a + * non-zero byte that could not be copied to userspace (ie. was past @usize). + * + * While unconditionally returning an error in this case is the simplest + * solution, for maximum backward compatibility you should try to only return + * -EMSGSIZE if the user explicitly requested the data that couldn't be copied. + * Note that structure sizes can change due to header changes and simple + * recompilations without code changes(!), so if you care about + * @ignored_trailing you probably want to make sure that any new field data is + * associated with a flag. Otherwise you might assume that a program knows + * about data it does not. + * + * @ksize is just sizeof(*src), and @usize should've been passed by userspace. + * The recommended usage is something like the following: + * + * SYSCALL_DEFINE2(foobar, struct foo __user *, uarg, size_t, usize) + * { + * int err; + * bool ignored_trailing; + * struct foo karg = {}; + * + * if (usize > PAGE_SIZE) + * return -E2BIG; + * if (usize < FOO_SIZE_VER0) + * return -EINVAL; + * + * // ... modify karg somehow ... + * + * err = copy_struct_to_user(uarg, usize, &karg, sizeof(karg), + * &ignored_trailing); + * if (err) + * return err; + * if (ignored_trailing) + * return -EMSGSIZE: + * + * // ... + * } + * + * There are three cases to consider: + * * If @usize == @ksize, then it's copied verbatim. + * * If @usize < @ksize, then the kernel is trying to pass userspace a newer + * struct than it supports. Thus we only copy the interoperable portions + * (@usize) and ignore the rest (but @ignored_trailing is set to %true if + * any of the trailing (@ksize - @usize) bytes are non-zero). + * * If @usize > @ksize, then the kernel is trying to pass userspace an older + * struct than userspace supports. In order to make sure the + * unknown-to-the-kernel fields don't contain garbage values, we zero the + * trailing (@usize - @ksize) bytes. + * + * Returns (in all cases, some data may have been copied): + * * -EFAULT: access to userspace failed. + */ +static __always_inline __must_check int +copy_struct_to_user(void __user *dst, size_t usize, const void *src, + size_t ksize, bool *ignored_trailing) +{ + size_t size = min(ksize, usize); + size_t rest = max(ksize, usize) - size; + + /* Double check if ksize is larger than a known object size. */ + if (WARN_ON_ONCE(ksize > __builtin_object_size(src, 1))) + return -E2BIG; + + /* Deal with trailing bytes. */ + if (usize > ksize) { + if (clear_user(dst + size, rest)) + return -EFAULT; + } + if (ignored_trailing) + *ignored_trailing = ksize < usize && + memchr_inv(src + size, 0, rest) != NULL; + /* Copy the interoperable parts of the struct. */ + if (copy_to_user(dst, src, size)) + return -EFAULT; + return 0; +} + bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size); long copy_from_kernel_nofault(void *dst, const void *src, size_t size); -- cgit v1.2.3 From 3e482e2840546a3ff725d5adf8d0779ac1c3cf4f Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 18 Oct 2024 14:37:25 -0700 Subject: dmaengine: idxd: Move DSA/IAA device IDs to IDXD driver Since the DSA/IAA device IDs are only used by the IDXD driver, there is no need to define them as public IDs. Move their definitions to the IDXD driver to limit their scope. This change helps reduce unnecessary exposure of the device IDs in the global space, making the codebase cleaner and better encapsulated. There is no functional change. Signed-off-by: Fenghua Yu Link: https://lore.kernel.org/r/20241018213725.4167413-1-fenghua.yu@intel.com Signed-off-by: Vinod Koul --- include/linux/pci_ids.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 4cf6aaed5f35..e4bddb927795 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2709,9 +2709,6 @@ #define PCI_DEVICE_ID_INTEL_82815_MC 0x1130 #define PCI_DEVICE_ID_INTEL_82815_CGC 0x1132 #define PCI_DEVICE_ID_INTEL_SST_TNG 0x119a -#define PCI_DEVICE_ID_INTEL_DSA_GNRD 0x11fb -#define PCI_DEVICE_ID_INTEL_DSA_DMR 0x1212 -#define PCI_DEVICE_ID_INTEL_IAA_DMR 0x1216 #define PCI_DEVICE_ID_INTEL_82092AA_0 0x1221 #define PCI_DEVICE_ID_INTEL_82437 0x122d #define PCI_DEVICE_ID_INTEL_82371FB_0 0x122e -- cgit v1.2.3 From 6474353a5e3d0b2cf610153cea0c61f576a36d0a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 25 Sep 2024 11:05:16 +0200 Subject: epoll: annotate racy check Epoll relies on a racy fastpath check during __fput() in eventpoll_release() to avoid the hit of pointlessly acquiring a semaphore. Annotate that race by using WRITE_ONCE() and READ_ONCE(). Link: https://lore.kernel.org/r/66edfb3c.050a0220.3195df.001a.GAE@google.com Link: https://lore.kernel.org/r/20240925-fungieren-anbauen-79b334b00542@brauner Reviewed-by: Jan Kara Reported-by: syzbot+3b6b32dc50537a49bb4a@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- include/linux/eventpoll.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index 3337745d81bd..0c0d00fcd131 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -42,7 +42,7 @@ static inline void eventpoll_release(struct file *file) * because the file in on the way to be removed and nobody ( but * eventpoll ) has still a reference to this file. */ - if (likely(!file->f_ep)) + if (likely(!READ_ONCE(file->f_ep))) return; /* -- cgit v1.2.3 From 900bbaae67e980945dec74d36f8afe0de7556d5a Mon Sep 17 00:00:00 2001 From: Xuewen Yan Date: Fri, 26 Apr 2024 16:05:48 +0800 Subject: epoll: Add synchronous wakeup support for ep_poll_callback Now, the epoll only use wake_up() interface to wake up task. However, sometimes, there are epoll users which want to use the synchronous wakeup flag to hint the scheduler, such as Android binder driver. So add a wake_up_sync() define, and use the wake_up_sync() when the sync is true in ep_poll_callback(). Co-developed-by: Jing Xia Signed-off-by: Jing Xia Signed-off-by: Xuewen Yan Link: https://lore.kernel.org/r/20240426080548.8203-1-xuewen.yan@unisoc.com Tested-by: Brian Geffon Reviewed-by: Brian Geffon Reported-by: Benoit Lize Signed-off-by: Christian Brauner --- include/linux/wait.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 8aa3372f21a0..2b322a9b88a2 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -221,6 +221,7 @@ void __wake_up_pollfree(struct wait_queue_head *wq_head); #define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL) #define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL, 1) #define wake_up_all_locked(x) __wake_up_locked((x), TASK_NORMAL, 0) +#define wake_up_sync(x) __wake_up_sync(x, TASK_NORMAL) #define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL) #define wake_up_interruptible_nr(x, nr) __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL) -- cgit v1.2.3 From 99bdadbde9c418f29b78b7241732268dbc0a05cc Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 15 Oct 2024 22:21:54 +0200 Subject: acl: Realign struct posix_acl to save 8 bytes Reduce posix_acl's struct size by 8 bytes by realigning its members. Cc: Christian Brauner Reviewed-by: Jan Kara Signed-off-by: Thorsten Blum Link: https://lore.kernel.org/r/20241015202158.2376-1-thorsten.blum@linux.dev Signed-off-by: Christian Brauner --- include/linux/posix_acl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 0e65b3d634d9..2d6a4badd306 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -28,8 +28,8 @@ struct posix_acl_entry { struct posix_acl { refcount_t a_refcount; - struct rcu_head a_rcu; unsigned int a_count; + struct rcu_head a_rcu; struct posix_acl_entry a_entries[]; }; -- cgit v1.2.3 From 8c6e03ffedc5463d1aa1ba89f6ceb082518a3520 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Fri, 18 Oct 2024 14:14:21 +0200 Subject: acl: Annotate struct posix_acl with __counted_by() Add the __counted_by compiler attribute to the flexible array member a_entries to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE. Use struct_size() to calculate the number of bytes to allocate for new and cloned acls and remove the local size variables. Change the posix_acl_alloc() function parameter count from int to unsigned int to match posix_acl's a_count data type. Add identifier names to the function definition to silence two checkpatch warnings. Reviewed-by: Jan Kara Signed-off-by: Thorsten Blum Link: https://lore.kernel.org/r/20241018121426.155247-2-thorsten.blum@linux.dev Cc: Nathan Chancellor Signed-off-by: Christian Brauner --- include/linux/posix_acl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 2d6a4badd306..e2d47eb1a7f3 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -30,7 +30,7 @@ struct posix_acl { refcount_t a_refcount; unsigned int a_count; struct rcu_head a_rcu; - struct posix_acl_entry a_entries[]; + struct posix_acl_entry a_entries[] __counted_by(a_count); }; #define FOREACH_ACL_ENTRY(pa, acl, pe) \ @@ -62,7 +62,7 @@ posix_acl_release(struct posix_acl *acl) /* posix_acl.c */ extern void posix_acl_init(struct posix_acl *, int); -extern struct posix_acl *posix_acl_alloc(int, gfp_t); +extern struct posix_acl *posix_acl_alloc(unsigned int count, gfp_t flags); extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t); extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *); extern int __posix_acl_create(struct posix_acl **, gfp_t, umode_t *); -- cgit v1.2.3 From c2114a0d17631497df99388905353c11f4d5f0dd Mon Sep 17 00:00:00 2001 From: Yassine Oudjana Date: Thu, 17 Oct 2024 11:51:35 +0300 Subject: pmdomain: mediatek: Add support for MT6735 Add support for SCPSYS power domains of MT6735. All non-CPU power domains are added except for MD2 (C2K modem), which is left out due to issues with powering it on. Signed-off-by: Yassine Oudjana Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20241017085136.68053-3-y.oudjana@protonmail.com Signed-off-by: Ulf Hansson --- include/linux/soc/mediatek/infracfg.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/infracfg.h b/include/linux/soc/mediatek/infracfg.h index 6c6cccc848f4..9956e18c5ffa 100644 --- a/include/linux/soc/mediatek/infracfg.h +++ b/include/linux/soc/mediatek/infracfg.h @@ -434,6 +434,11 @@ #define MT7622_TOP_AXI_PROT_EN_WB (BIT(2) | BIT(6) | \ BIT(7) | BIT(8)) +#define MT6735_TOP_AXI_PROT_EN_CONN (BIT(2) | BIT(8)) +#define MT6735_TOP_AXI_PROT_EN_MD1 (BIT(24) | BIT(25) | \ + BIT(26) | BIT(27) | \ + BIT(28)) + #define INFRA_TOPAXI_PROTECTEN 0x0220 #define INFRA_TOPAXI_PROTECTSTA1 0x0228 #define INFRA_TOPAXI_PROTECTEN_SET 0x0260 -- cgit v1.2.3 From d811ac148f0afd2f3f7e1cd7f54de8da973ec5e3 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 15 Oct 2024 15:56:35 +0200 Subject: virtchnl: fix m68k build. The kernel test robot reported a build failure on m68k in the intel driver due to the recent shapers-related changes. The mentioned arch has funny alignment properties, let's be explicit about the binary layout expectation introducing a padding field. Fixes: 608a5c05c39b ("virtchnl: support queue rate limit and quanta size configuration") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202410131710.71Wt6LKO-lkp@intel.com/ Reviewed-by: Alexander Lobakin Reviewed-by: Paul Menzel Reviewed-by: Jacob Keller Link: https://patch.msgid.link/e45d1c9f17356d431b03b419f60b8b763d2ff768.1729000481.git.pabeni@redhat.com Signed-off-by: Paolo Abeni --- include/linux/avf/virtchnl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 223e433c39fe..13a11f3c09b8 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -1499,6 +1499,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_queue_chunk); struct virtchnl_quanta_cfg { u16 quanta_size; + u16 pad; struct virtchnl_queue_chunk queue_select; }; -- cgit v1.2.3 From 2c50ec98fc6cab28df35e0a22a2bcc7957d9d0ab Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 2 Oct 2024 14:06:47 -0600 Subject: block: remove redundant passthrough check in blk_mq_need_time_stamp() Simply checking the rq_flags is enough to determine if accounting is being done for this request. Reviewed-by: Keith Busch Reviewed-by: Anuj Gupta Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 4fecf46ef681..59e9adf815a4 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -857,12 +857,6 @@ void blk_mq_end_request_batch(struct io_comp_batch *ib); */ static inline bool blk_mq_need_time_stamp(struct request *rq) { - /* - * passthrough io doesn't use iostat accounting, cgroup stats - * and io scheduler functionalities. - */ - if (blk_rq_is_passthrough(rq)) - return false; return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_USE_SCHED)); } -- cgit v1.2.3 From 9dfd9ea93aeab57d897bb7fc7c0707f26b0b9af8 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Thu, 3 Oct 2024 00:11:43 +0200 Subject: block: introduce add_disk_fwnode() Introduce add_disk_fwnode() as a replacement of device_add_disk() that permits to pass and attach a fwnode to disk dev. This variant can be useful for eMMC that might have the partition table for the disk defined in DT. A parser can later make use of the attached fwnode to parse the related table and init the hardcoded partition for the disk. device_add_disk() is converted to a simple wrapper of add_disk_fwnode() with the fwnode entry set as NULL. Signed-off-by: Christian Marangi Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241002221306.4403-4-ansuelsmth@gmail.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 50c3b959da28..a6aae750b4ac 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -725,6 +725,9 @@ static inline unsigned int blk_queue_depth(struct request_queue *q) #define for_each_bio(_bio) \ for (; _bio; _bio = _bio->bi_next) +int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups, + struct fwnode_handle *fwnode); int __must_check device_add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups); static inline int __must_check add_disk(struct gendisk *disk) -- cgit v1.2.3 From 110234da18ab482f6f583d28eff26b9569bf5622 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 7 Oct 2024 08:32:35 -0700 Subject: block: enable passthrough command statistics Applications using the passthrough interfaces for IO want to continue seeing the disk stats. These requests had been fenced off from this block layer feature. While the block layer doesn't necessarily know what a passthrough command does, we do know the data size and direction, which is enough to account for the command's stats. Since tracking these has the potential to produce unexpected results, the passthrough stats are locked behind a new queue flag that needs to be enabled with the /sys/block//queue/iostats_passthrough attribute. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241007153236.2818562-1-kbusch@meta.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a6aae750b4ac..6b78a68e0bd9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -349,6 +349,9 @@ typedef unsigned int __bitwise blk_flags_t; /* I/O topology is misaligned */ #define BLK_FLAG_MISALIGNED ((__force blk_flags_t)(1u << 1)) +/* passthrough command IO accounting */ +#define BLK_FLAG_IOSTATS_PASSTHROUGH ((__force blk_flags_t)(1u << 2)) + struct queue_limits { blk_features_t features; blk_flags_t flags; @@ -617,6 +620,8 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) #define blk_queue_nonrot(q) (!((q)->limits.features & BLK_FEAT_ROTATIONAL)) #define blk_queue_io_stat(q) ((q)->limits.features & BLK_FEAT_IO_STAT) +#define blk_queue_passthrough_stat(q) \ + ((q)->limits.flags & BLK_FLAG_IOSTATS_PASSTHROUGH) #define blk_queue_dax(q) ((q)->limits.features & BLK_FEAT_DAX) #define blk_queue_pci_p2pdma(q) ((q)->limits.features & BLK_FEAT_PCI_P2PDMA) #ifdef CONFIG_BLK_RQ_ALLOC_TIME -- cgit v1.2.3 From b21d948f4cc73e3296f2365c7afca721dd6893fa Mon Sep 17 00:00:00 2001 From: Greg Joyce Date: Thu, 29 Aug 2024 12:56:11 -0500 Subject: block: sed-opal: add ioctl IOC_OPAL_SET_SID_PW After a SED drive is provisioned, there is no way to change the SID password via the ioctl() interface. A new ioctl IOC_OPAL_SET_SID_PW will allow the password to be changed. The valid current password is required. Signed-off-by: Greg Joyce Reviewed-by: Daniel Wagner Link: https://lore.kernel.org/r/20240829175639.6478-2-gjoyce@linux.ibm.com Signed-off-by: Jens Axboe --- include/linux/sed-opal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 2ac50822554e..80f33a93f944 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -52,6 +52,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_GET_GEOMETRY: case IOC_OPAL_DISCOVERY: case IOC_OPAL_REVERT_LSP: + case IOC_OPAL_SET_SID_PW: return true; } return false; -- cgit v1.2.3 From 8cf8dfceebdaf282da8a836b2bb578808a12698c Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 22 Oct 2024 15:41:34 +0200 Subject: seccomp: Stub for !HAVE_ARCH_SECCOMP_FILTER If we have CONFIG_SECCOMP but not CONFIG_HAVE_ARCH_SECCOMP_FILTER we get a compilation error: ../kernel/entry/common.c: In function 'syscall_trace_enter': ../kernel/entry/common.c:55:23: error: implicit declaration of function '__secure_computing' [-Werror=implicit-function-declaration] 55 | ret = __secure_computing(NULL); | ^~~~~~~~~~~~~~~~~~ This is because generic entry calls __secure_computing() unconditionally. Provide the needed stub similar to how current ARM does this by calling secure_computing_strict() in the absence of secure_computing(). This is similar to what is done for ARM in arch/arm/kernel/ptrace.c. Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20241022-seccomp-compile-error-v2-1-c9f08a4f8ebb@linaro.org Signed-off-by: Kees Cook --- include/linux/seccomp.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 709ad84809e1..341980599c71 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -32,6 +32,11 @@ static inline int secure_computing(void) } #else extern void secure_computing_strict(int this_syscall); +static inline int __secure_computing(const struct seccomp_data *sd) +{ + secure_computing_strict(sd->nr); + return 0; +} #endif extern long prctl_get_seccomp(void); -- cgit v1.2.3 From cfcdf395c21eeac4543d2b8fef9d29ae9e4559e9 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Tue, 8 Oct 2024 18:07:02 +0200 Subject: regulator: core: add callback to perform runtime init Provide an initialisation callback to handle runtime parameters. The idea is similar to the regulator_init() callback, but it provides regulator specific structures, instead of just the driver specific data. As an example, this allows the driver to amend the regulator constraints based on runtime parameters if necessary. Signed-off-by: Jerome Brunet Link: https://patch.msgid.link/20241008-regulator-ignored-data-v2-2-d1251e0ee507@baylibre.com Signed-off-by: Mark Brown --- include/linux/regulator/driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index f230a472ccd3..d2f4427504f0 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -365,6 +365,8 @@ struct regulator_desc { int (*of_parse_cb)(struct device_node *, const struct regulator_desc *, struct regulator_config *); + int (*init_cb)(struct regulator_dev *, + struct regulator_config *); int id; unsigned int continuous_voltage_range:1; unsigned n_voltages; -- cgit v1.2.3 From 602ff58ae4fe4289b0ca71cba9fb82f7de92cd64 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Tue, 8 Oct 2024 18:07:03 +0200 Subject: regulator: core: remove machine init callback from config The machine specific regulator_init() appears to be unused. It does not allow a lot of interaction with the regulator framework, since nothing from the framework is passed along (desc, config, etc ...) Machine specific init may also be done with the added init_cb() in the regulator description, so remove regulator_init(). Signed-off-by: Jerome Brunet Link: https://patch.msgid.link/20241008-regulator-ignored-data-v2-3-d1251e0ee507@baylibre.com Signed-off-by: Mark Brown --- include/linux/regulator/machine.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index 0cd76d264727..d0d700ff337a 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -285,8 +285,7 @@ struct regulator_init_data { int num_consumer_supplies; struct regulator_consumer_supply *consumer_supplies; - /* optional regulator machine specific init */ - int (*regulator_init)(void *driver_data); + /* optional regulator machine specific data */ void *driver_data; /* core does not touch this */ }; -- cgit v1.2.3 From 51e32e897539663957f7a0950f66b48f8896efee Mon Sep 17 00:00:00 2001 From: Cristian Ciocaltea Date: Sat, 19 Oct 2024 14:16:00 +0300 Subject: clk: Provide devm_clk_bulk_get_all_enabled() helper Commit 265b07df758a ("clk: Provide managed helper to get and enable bulk clocks") added devm_clk_bulk_get_all_enable() function, but missed to return the number of clocks stored in the clk_bulk_data table referenced by the clks argument. Without knowing the number, it's not possible to iterate these clocks when needed, hence the argument is useless and could have been simply removed. Introduce devm_clk_bulk_get_all_enabled() variant, which is consistent with devm_clk_bulk_get_all() in terms of the returned value: > 0 if one or more clocks have been stored = 0 if there are no clocks < 0 if an error occurred Moreover, the naming is consistent with devm_clk_get_enabled(), i.e. use the past form of 'enable'. To reduce code duplication and improve patch readability, make devm_clk_bulk_get_all_enable() use the new helper, as suggested by Stephen Boyd. Reviewed-by: AngeloGioacchino Del Regno Reviewed-by: Manivannan Sadhasivam Signed-off-by: Cristian Ciocaltea Link: https://lore.kernel.org/r/20241019-clk_bulk_ena_fix-v4-1-57f108f64e70@collabora.com Signed-off-by: Stephen Boyd --- include/linux/clk.h | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk.h b/include/linux/clk.h index 851a0f2cf42c..1dcee6d701e4 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -496,11 +496,13 @@ int __must_check devm_clk_bulk_get_all(struct device *dev, struct clk_bulk_data **clks); /** - * devm_clk_bulk_get_all_enable - Get and enable all clocks of the consumer (managed) + * devm_clk_bulk_get_all_enabled - Get and enable all clocks of the consumer (managed) * @dev: device for clock "consumer" * @clks: pointer to the clk_bulk_data table of consumer * - * Returns success (0) or negative errno. + * Returns a positive value for the number of clocks obtained while the + * clock references are stored in the clk_bulk_data table in @clks field. + * Returns 0 if there're none and a negative value if something failed. * * This helper function allows drivers to get all clocks of the * consumer and enables them in one operation with management. @@ -508,8 +510,8 @@ int __must_check devm_clk_bulk_get_all(struct device *dev, * is unbound. */ -int __must_check devm_clk_bulk_get_all_enable(struct device *dev, - struct clk_bulk_data **clks); +int __must_check devm_clk_bulk_get_all_enabled(struct device *dev, + struct clk_bulk_data **clks); /** * devm_clk_get - lookup and obtain a managed reference to a clock producer. @@ -1034,7 +1036,7 @@ static inline int __must_check devm_clk_bulk_get_all(struct device *dev, return 0; } -static inline int __must_check devm_clk_bulk_get_all_enable(struct device *dev, +static inline int __must_check devm_clk_bulk_get_all_enabled(struct device *dev, struct clk_bulk_data **clks) { return 0; @@ -1136,6 +1138,15 @@ static inline void clk_restore_context(void) {} #endif +/* Deprecated. Use devm_clk_bulk_get_all_enabled() */ +static inline int __must_check +devm_clk_bulk_get_all_enable(struct device *dev, struct clk_bulk_data **clks) +{ + int ret = devm_clk_bulk_get_all_enabled(dev, clks); + + return ret > 0 ? 0 : ret; +} + /* clk_prepare_enable helps cases using clk_enable in non-atomic context. */ static inline int clk_prepare_enable(struct clk *clk) { -- cgit v1.2.3 From 44059790a5cb9258ae6137387e4c39b717fd2ced Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Oct 2024 07:53:04 +0200 Subject: kfifo: don't include dma-mapping.h in kfifo.h Nothing in kfifo.h directly needs dma-mapping.h, only two macros use DMA_MAPPING_ERROR when actually instantiated. Drop the dma-mapping.h include to reduce include bloat. Add an explicity include to drivers/mailbox/omap-mailbox.c as that file uses __raw_readl and __raw_writel through a complicated include chain involving Fixes: d52b761e4b1a ("kfifo: add kfifo_dma_out_prepare_mapped()") Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241023055317.313234-1-hch@lst.de Signed-off-by: Greg Kroah-Hartman --- include/linux/kfifo.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index 564868bdce89..fd743d4c4b4b 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -37,7 +37,6 @@ */ #include -#include #include #include #include -- cgit v1.2.3 From e44ef3f66c5472c2cbc6957c684d7279c26b0db1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 18 Oct 2024 05:21:08 +0000 Subject: netpoll: remove ndo_netpoll_setup() second argument npinfo is not used in any of the ndo_netpoll_setup() methods. Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241018052108.2610827-1-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8feaca12655e..86a0b7eb9461 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1425,8 +1425,7 @@ struct net_device_ops { __be16 proto, u16 vid); #ifdef CONFIG_NET_POLL_CONTROLLER void (*ndo_poll_controller)(struct net_device *dev); - int (*ndo_netpoll_setup)(struct net_device *dev, - struct netpoll_info *info); + int (*ndo_netpoll_setup)(struct net_device *dev); void (*ndo_netpoll_cleanup)(struct net_device *dev); #endif int (*ndo_set_vf_mac)(struct net_device *dev, -- cgit v1.2.3 From 7cfc1b1fa8673fe386304194d4cc2c8fe555bbf9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 18 Oct 2024 05:23:10 +0000 Subject: net: netdev_tx_sent_queue() small optimization Change smp_mb() imediately following a set_bit() with smp_mb__after_atomic(). Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20241018052310.2612084-1-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 86a0b7eb9461..bbd30f3c5d29 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3517,7 +3517,7 @@ static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue, * because in netdev_tx_completed_queue we update the dql_completed * before checking the XOFF flag. */ - smp_mb(); + smp_mb__after_atomic(); /* check again in case another CPU has just made room avail */ if (unlikely(dql_avail(&dev_queue->dql) >= 0)) -- cgit v1.2.3 From e55f45b0cda71ac2e9b4c6dee8852dc8d6f22ef0 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Wed, 23 Oct 2024 10:35:43 +0200 Subject: regulator: doc: add missing documentation for init_cb Add comment documenting introduced init_cb. This solves the following warning when building the kernel documentation: ./include/linux/regulator/driver.h:435: warning: Function parameter or struct member 'init_cb' not described in 'regulator_desc' Fixes: cfcdf395c21e ("regulator: core: add callback to perform runtime init") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/lkml/20241023155120.6c4fea20@canb.auug.org.au/ Signed-off-by: Jerome Brunet Link: https://patch.msgid.link/20241023-regulator-doc-fixup-v1-1-ec018742ad73@baylibre.com Signed-off-by: Mark Brown --- include/linux/regulator/driver.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index d2f4427504f0..5b66caf1695d 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -269,6 +269,11 @@ enum regulator_type { * config but it cannot store it for later usage. * Callback should return 0 on success or negative ERRNO * indicating failure. + * @init_cb: Optional callback called after the parsing of init_data. + * Allows the regulator to perform runtime init if necessary, + * such as synching the regulator and the parsed constraints. + * Callback should return 0 on success or negative ERRNO + * indicating failure. * @id: Numerical identifier for the regulator. * @ops: Regulator operations table. * @irq: Interrupt number for the regulator. -- cgit v1.2.3 From d1bc2d5cca434e7c854fd16ef021c16d74293b60 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Wed, 23 Oct 2024 10:35:44 +0200 Subject: regulator: doc: remove documentation comment for regulator_init Remove documentation comment related to regulator_init callback. This solves the following warning when building the kernel documentation: ./include/linux/regulator/machine.h:290: warning: Excess struct member 'regulator_init' description in 'regulator_init_data' Fixes: 602ff58ae4fe ("regulator: core: remove machine init callback from config") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/lkml/20241023155257.0fa7211d@canb.auug.org.au/ Signed-off-by: Jerome Brunet Link: https://patch.msgid.link/20241023-regulator-doc-fixup-v1-2-ec018742ad73@baylibre.com Signed-off-by: Mark Brown --- include/linux/regulator/machine.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index d0d700ff337a..b3db09a7429b 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -273,8 +273,6 @@ struct regulator_consumer_supply { * be usable. * @num_consumer_supplies: Number of consumer device supplies. * @consumer_supplies: Consumer device supply configuration. - * - * @regulator_init: Callback invoked when the regulator has been registered. * @driver_data: Data passed to regulator_init. */ struct regulator_init_data { -- cgit v1.2.3 From 50a191a8a12b33dfad3b27c6ba4e76c5ba39db73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 10 Aug 2024 19:00:35 +0200 Subject: sysctl: update comments to new registration APIs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sysctl registration APIs do not need a terminating table entry anymore and with commit acc154691fc7 ("sysctl: Warn on an empty procname element") even emit warnings if such a sentinel entry is supplied. While at it also remove the mention of "table->de" which was removed in commit 3fbfa98112fc ("[PATCH] sysctl: remove the proc_dir_entry member for the sysctl tables") back in 2007. Signed-off-by: Thomas Weißschuh Reviewed-by: Kees Cook Signed-off-by: Joel Granados --- include/linux/sysctl.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 202855befa8b..40a6ac6c9713 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -90,9 +90,7 @@ int proc_do_static_key(const struct ctl_table *table, int write, void *buffer, /* * Register a set of sysctl names by calling register_sysctl - * with an initialised array of struct ctl_table's. An entry with - * NULL procname terminates the table. table->de will be - * set up by the registration and need not be initialised in advance. + * with an initialised array of struct ctl_table's. * * sysctl names can be mirrored automatically under /proc/sys. The * procname supplied controls /proc naming. @@ -133,7 +131,7 @@ static inline void *proc_sys_poll_event(struct ctl_table_poll *poll) /* A sysctl table is an array of struct ctl_table: */ struct ctl_table { - const char *procname; /* Text ID for /proc/sys, or zero */ + const char *procname; /* Text ID for /proc/sys */ void *data; int maxlen; umode_t mode; -- cgit v1.2.3 From 8dc6d81c6b2acc434b00c4585f0594739031c4e4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 22 Oct 2024 15:18:34 +0200 Subject: debugfs: add small file operations for most files As struct file_operations is really big, but (most) debugfs files only use simple_open, read, write and perhaps seek, and don't need anything else, this wastes a lot of space for NULL pointers. Add a struct debugfs_short_fops and some bookkeeping code in debugfs so that users can use that with debugfs_create_file() using _Generic to figure out which function to use. Converting mac80211 to use it where possible saves quite a bit of space: 1010127 205064 1220 1216411 128f9b net/mac80211/mac80211.ko (before) 981199 205064 1220 1187483 121e9b net/mac80211/mac80211.ko (after) ------- -28928 = ~28KiB With a marginal space cost in debugfs: 8701 550 16 9267 2433 fs/debugfs/inode.o (before) 25233 325 32 25590 63f6 fs/debugfs/file.o (before) 8914 558 16 9488 2510 fs/debugfs/inode.o (after) 25380 325 32 25737 6489 fs/debugfs/file.o (after) --------------- +360 +8 (All on x86-64) A simple spatch suggests there are more than 300 instances, not even counting the ones hidden in macros like in mac80211, that could be trivially converted, for additional savings of about 240 bytes for each. Reviewed-by: Greg Kroah-Hartman Link: https://patch.msgid.link/20241022151838.26f9925fb959.Ia80b55e934bbfc45ce0df42a3233d34b35508046@changeid Signed-off-by: Johannes Berg --- include/linux/debugfs.h | 62 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 58 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 0928a6c8ae1e..59444b495d49 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -71,9 +71,63 @@ typedef struct vfsmount *(*debugfs_automount_t)(struct dentry *, void *); struct dentry *debugfs_lookup(const char *name, struct dentry *parent); -struct dentry *debugfs_create_file(const char *name, umode_t mode, - struct dentry *parent, void *data, - const struct file_operations *fops); +struct debugfs_short_fops { + ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); + ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); + loff_t (*llseek) (struct file *, loff_t, int); +}; + +struct dentry *debugfs_create_file_full(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fops); +struct dentry *debugfs_create_file_short(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct debugfs_short_fops *fops); + +/** + * debugfs_create_file - create a file in the debugfs filesystem + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is NULL, then the + * file will be created in the root of the debugfs filesystem. + * @data: a pointer to something that the caller will want to get to later + * on. The inode.i_private pointer will point to this value on + * the open() call. + * @fops: a pointer to a struct file_operations or struct debugfs_short_fops that + * should be used for this file. + * + * This is the basic "create a file" function for debugfs. It allows for a + * wide range of flexibility in creating a file, or a directory (if you want + * to create a directory, the debugfs_create_dir() function is + * recommended to be used instead.) + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be + * returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. + * + * If fops points to a struct debugfs_short_fops, then simple_open() will be + * used for the open, and only read/write/llseek are supported and are proxied, + * so no module reference or release are needed. + * + * NOTE: it's expected that most callers should _ignore_ the errors returned + * by this function. Other debugfs functions handle the fact that the "dentry" + * passed to them could be an error and they don't crash in that case. + * Drivers should generally work fine even if debugfs fails to init anyway. + */ +#define debugfs_create_file(name, mode, parent, data, fops) \ + _Generic(fops, \ + const struct file_operations *: debugfs_create_file_full, \ + const struct debugfs_short_fops *: debugfs_create_file_short, \ + struct file_operations *: debugfs_create_file_full, \ + struct debugfs_short_fops *: debugfs_create_file_short) \ + (name, mode, parent, data, fops) + struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops); @@ -207,7 +261,7 @@ static inline struct dentry *debugfs_lookup(const char *name, static inline struct dentry *debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, - const struct file_operations *fops) + const void *fops) { return ERR_PTR(-ENODEV); } -- cgit v1.2.3 From 26ff1fb02991e1260481185bb5ccab1ee498d5e4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Oct 2024 09:19:29 -0700 Subject: rcu: Delete unused rcu_gp_might_be_stalled() function The rcu_gp_might_be_stalled() function is no longer used, so this commit removes it. Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) Signed-off-by: Frederic Weisbecker --- include/linux/rcutiny.h | 1 - include/linux/rcutree.h | 1 - 2 files changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 0ee270b3f5ed..fe42315f667f 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -165,7 +165,6 @@ static inline bool rcu_inkernel_boot_has_ended(void) { return true; } static inline bool rcu_is_watching(void) { return true; } static inline void rcu_momentary_eqs(void) { } static inline void kfree_rcu_scheduler_running(void) { } -static inline bool rcu_gp_might_be_stalled(void) { return false; } /* Avoid RCU read-side critical sections leaking across. */ static inline void rcu_all_qs(void) { barrier(); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 90a684f94776..27d86d912781 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -40,7 +40,6 @@ void kvfree_rcu_barrier(void); void rcu_barrier(void); void rcu_momentary_eqs(void); void kfree_rcu_scheduler_running(void); -bool rcu_gp_might_be_stalled(void); struct rcu_gp_oldstate { unsigned long rgos_norm; -- cgit v1.2.3 From da09a9e0c3eab164af950be44ee6bdea8527c3e5 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 18 Oct 2024 22:22:51 +0200 Subject: uprobe: Add data pointer to consumer handlers Adding data pointer to both entry and exit consumer handlers and all its users. The functionality itself is coming in following change. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Acked-by: Oleg Nesterov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20241018202252.693462-2-jolsa@kernel.org --- include/linux/uprobes.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 2b294bf1881f..bb265a632b91 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -37,10 +37,10 @@ struct uprobe_consumer { * for the current process. If filter() is omitted or returns true, * UPROBE_HANDLER_REMOVE is effectively ignored. */ - int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs); + int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs, __u64 *data); int (*ret_handler)(struct uprobe_consumer *self, unsigned long func, - struct pt_regs *regs); + struct pt_regs *regs, __u64 *data); bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm); struct list_head cons_node; -- cgit v1.2.3 From 4d756095d3994cb41393817dc696b458938a6bd0 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 18 Oct 2024 22:22:52 +0200 Subject: uprobe: Add support for session consumer This change allows the uprobe consumer to behave as session which means that 'handler' and 'ret_handler' callbacks are connected in a way that allows to: - control execution of 'ret_handler' from 'handler' callback - share data between 'handler' and 'ret_handler' callbacks The session concept fits to our common use case where we do filtering on entry uprobe and based on the result we decide to run the return uprobe (or not). It's also convenient to share the data between session callbacks. To achive this we are adding new return value the uprobe consumer can return from 'handler' callback: UPROBE_HANDLER_IGNORE - Ignore 'ret_handler' callback for this consumer. And store cookie and pass it to 'ret_handler' when consumer has both 'handler' and 'ret_handler' callbacks defined. We store shared data in the return_consumer object array as part of the return_instance object. This way the handle_uretprobe_chain can find related return_consumer and its shared data. We also store entry handler return value, for cases when there are multiple consumers on single uprobe and some of them are ignored and some of them not, in which case the return probe gets installed and we need to have a way to find out which consumer needs to be ignored. The tricky part is when consumer is registered 'after' the uprobe entry handler is hit. In such case this consumer's 'ret_handler' gets executed as well, but it won't have the proper data pointer set, so we can filter it out. Suggested-by: Oleg Nesterov Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20241018202252.693462-3-jolsa@kernel.org --- include/linux/uprobes.h | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index bb265a632b91..dbaf04189548 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -23,8 +23,17 @@ struct inode; struct notifier_block; struct page; +/* + * Allowed return values from uprobe consumer's handler callback + * with following meaning: + * + * UPROBE_HANDLER_REMOVE + * - Remove the uprobe breakpoint from current->mm. + * UPROBE_HANDLER_IGNORE + * - Ignore ret_handler callback for this consumer. + */ #define UPROBE_HANDLER_REMOVE 1 -#define UPROBE_HANDLER_MASK 1 +#define UPROBE_HANDLER_IGNORE 2 #define MAX_URETPROBE_DEPTH 64 @@ -44,6 +53,8 @@ struct uprobe_consumer { bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm); struct list_head cons_node; + + __u64 id; /* set when uprobe_consumer is registered */ }; #ifdef CONFIG_UPROBES @@ -83,14 +94,22 @@ struct uprobe_task { unsigned int depth; }; +struct return_consumer { + __u64 cookie; + __u64 id; +}; + struct return_instance { struct uprobe *uprobe; unsigned long func; unsigned long stack; /* stack pointer */ unsigned long orig_ret_vaddr; /* original return address */ bool chained; /* true, if instance is nested */ + int consumers_cnt; struct return_instance *next; /* keep as stack */ + + struct return_consumer consumers[] __counted_by(consumers_cnt); }; enum rp_check { -- cgit v1.2.3 From ee1251fc0c4e799a48025318f262739919deb977 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 22 Oct 2024 11:49:45 +0000 Subject: cgroup/freezer: Reduce redundant traversal for cgroup_freeze MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Whether a cgroup is frozen is determined solely by whether it is set to to be frozen and whether its parent is frozen. Currently, when is cgroup is frozen or unfrozen, it iterates through the entire subtree to freeze or unfreeze its descentdants. However, this is unesessary for a cgroup that does not change its effective frozen status. This path aims to skip the subtree if its parent does not have a change in effective freeze. For an example, subtree like, a-b-c-d-e-f-g, when a is frozen, the entire tree is frozen. If we freeze b and c again, it is unesessary to iterate d, e, f and g. So does that If we unfreeze b/c. Reviewed-by: Michal Koutný Signed-off-by: Chen Ridong Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 0a80ef9191a6..1b20d2d8ef7c 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -398,7 +398,7 @@ struct cgroup_freezer_state { bool freeze; /* Should the cgroup actually be frozen? */ - int e_freeze; + bool e_freeze; /* Fields below are protected by css_set_lock */ -- cgit v1.2.3 From 04af8a399fa40310f831b4f1dc9f757085f41983 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 18 Oct 2024 17:47:48 +0300 Subject: PCI: Protect Link Control 2 Register with RMW locking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PCIe Bandwidth Controller performs RMW accesses the Link Control 2 Register which can occur concurrently to other sources of Link Control 2 Register writes. Therefore, add Link Control 2 Register among the PCI Express Capability Registers that need RMW locking. Link: https://lore.kernel.org/r/20241018144755.7875-3-ilpo.jarvinen@linux.intel.com Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Reviewed-by: Lukas Wunner Reviewed-by: Jonathan Cameron --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 573b4c4c2be6..be5ed534c39c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1274,6 +1274,7 @@ static inline int pcie_capability_clear_and_set_word(struct pci_dev *dev, { switch (pos) { case PCI_EXP_LNKCTL: + case PCI_EXP_LNKCTL2: case PCI_EXP_RTCTL: return pcie_capability_clear_and_set_word_locked(dev, pos, clear, set); -- cgit v1.2.3 From 52e0874fc16bd26e9ea1871e30ffb2c6dff187cf Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 Aug 2024 12:39:02 +0200 Subject: locking/rt: Add sparse annotation PREEMPT_RT's sleeping locks. The sleeping locks on PREEMPT_RT (rt_spin_lock() and friends) lack sparse annotation. Therefore a missing spin_unlock() won't be spotted by sparse in a PREEMPT_RT build while it is noticed on a !PREEMPT_RT build. Add the __acquires/__releases macros to the lock/ unlock functions. The trylock functions already use the __cond_lock() wrapper. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240812104200.2239232-2-bigeasy@linutronix.de --- include/linux/rwlock_rt.h | 10 +++++----- include/linux/spinlock_rt.h | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h index 8544ff05e594..7d81fc6918ee 100644 --- a/include/linux/rwlock_rt.h +++ b/include/linux/rwlock_rt.h @@ -24,13 +24,13 @@ do { \ __rt_rwlock_init(rwl, #rwl, &__key); \ } while (0) -extern void rt_read_lock(rwlock_t *rwlock); +extern void rt_read_lock(rwlock_t *rwlock) __acquires(rwlock); extern int rt_read_trylock(rwlock_t *rwlock); -extern void rt_read_unlock(rwlock_t *rwlock); -extern void rt_write_lock(rwlock_t *rwlock); -extern void rt_write_lock_nested(rwlock_t *rwlock, int subclass); +extern void rt_read_unlock(rwlock_t *rwlock) __releases(rwlock); +extern void rt_write_lock(rwlock_t *rwlock) __acquires(rwlock); +extern void rt_write_lock_nested(rwlock_t *rwlock, int subclass) __acquires(rwlock); extern int rt_write_trylock(rwlock_t *rwlock); -extern void rt_write_unlock(rwlock_t *rwlock); +extern void rt_write_unlock(rwlock_t *rwlock) __releases(rwlock); static __always_inline void read_lock(rwlock_t *rwlock) { diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h index 61c49b16f69a..babc3e028779 100644 --- a/include/linux/spinlock_rt.h +++ b/include/linux/spinlock_rt.h @@ -32,10 +32,10 @@ do { \ __rt_spin_lock_init(slock, #slock, &__key, true); \ } while (0) -extern void rt_spin_lock(spinlock_t *lock); -extern void rt_spin_lock_nested(spinlock_t *lock, int subclass); -extern void rt_spin_lock_nest_lock(spinlock_t *lock, struct lockdep_map *nest_lock); -extern void rt_spin_unlock(spinlock_t *lock); +extern void rt_spin_lock(spinlock_t *lock) __acquires(lock); +extern void rt_spin_lock_nested(spinlock_t *lock, int subclass) __acquires(lock); +extern void rt_spin_lock_nest_lock(spinlock_t *lock, struct lockdep_map *nest_lock) __acquires(lock); +extern void rt_spin_unlock(spinlock_t *lock) __releases(lock); extern void rt_spin_lock_unlock(spinlock_t *lock); extern int rt_spin_trylock_bh(spinlock_t *lock); extern int rt_spin_trylock(spinlock_t *lock); -- cgit v1.2.3 From b1f01f9e54b1aaadb6740f86017e8fabdee77fe2 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 Aug 2024 12:39:03 +0200 Subject: locking/rt: Remove one __cond_lock() in RT's spin_trylock_irqsave() spin_trylock_irqsave() has a __cond_lock() wrapper which points to __spin_trylock_irqsave(). The function then invokes spin_trylock() which has another __cond_lock() finally pointing to rt_spin_trylock(). The compiler has no problem to parse this but sparse does not recognise that users of spin_trylock_irqsave() acquire a conditional lock and complains. Remove one layer of __cond_lock() so that sparse recognises conditional locking. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240812104200.2239232-3-bigeasy@linutronix.de --- include/linux/spinlock_rt.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h index babc3e028779..f9f14e135be7 100644 --- a/include/linux/spinlock_rt.h +++ b/include/linux/spinlock_rt.h @@ -132,7 +132,7 @@ static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, #define spin_trylock_irq(lock) \ __cond_lock(lock, rt_spin_trylock(lock)) -#define __spin_trylock_irqsave(lock, flags) \ +#define spin_trylock_irqsave(lock, flags) \ ({ \ int __locked; \ \ @@ -142,9 +142,6 @@ static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, __locked; \ }) -#define spin_trylock_irqsave(lock, flags) \ - __cond_lock(lock, __spin_trylock_irqsave(lock, flags)) - #define spin_is_contended(lock) (((void)(lock), 0)) static inline int spin_is_locked(spinlock_t *lock) -- cgit v1.2.3 From 9a9f7e13952b2638bc57bc9b34e6bdd106509836 Mon Sep 17 00:00:00 2001 From: Victor Shih Date: Fri, 18 Oct 2024 18:53:18 +0800 Subject: mmc: core: Support UHS-II card control and access Embed UHS-II access/control functionality into the MMC request processing flow. Signed-off-by: Jason Lai Signed-off-by: Victor Shih Message-ID: <20241018105333.4569-2-victorshihgli@gmail.com> [Ulf: A couple of cleanups and fixed sd_uhs2_power_off()] Signed-off-by: Ulf Hansson --- include/linux/mmc/core.h | 17 +++++++++++++++++ include/linux/mmc/host.h | 15 +++++++++++++++ 2 files changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index a890a71288ef..56972bd78462 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -11,6 +11,20 @@ struct mmc_data; struct mmc_request; +#define UHS2_MAX_PAYLOAD_LEN 2 +#define UHS2_MAX_RESP_LEN 20 + +struct uhs2_command { + u16 header; + u16 arg; + __be32 payload[UHS2_MAX_PAYLOAD_LEN]; + u8 payload_len; + u8 packet_len; + u8 tmode_half_duplex; + u8 uhs2_resp[UHS2_MAX_RESP_LEN]; /* UHS2 native cmd resp */ + u8 uhs2_resp_len; /* UHS2 native cmd resp len */ +}; + struct mmc_command { u32 opcode; u32 arg; @@ -97,6 +111,8 @@ struct mmc_command { struct mmc_data *data; /* data segment associated with cmd */ struct mmc_request *mrq; /* associated request */ + struct uhs2_command *uhs2_cmd; /* UHS2 command */ + /* for SDUC */ bool has_ext_addr; u8 ext_addr; @@ -158,6 +174,7 @@ struct mmc_request { const struct bio_crypt_ctx *crypto_ctx; int crypto_key_slot; #endif + struct uhs2_command uhs2_cmd; }; struct mmc_card; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 0980d06ed419..f166d6611ddb 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -127,6 +127,13 @@ struct sd_uhs2_caps { }; enum sd_uhs2_operation { + UHS2_PHY_INIT = 0, + UHS2_SET_CONFIG, + UHS2_ENABLE_INT, + UHS2_DISABLE_INT, + UHS2_ENABLE_CLK, + UHS2_DISABLE_CLK, + UHS2_CHECK_DORMANT, UHS2_SET_IOS, }; @@ -453,6 +460,8 @@ struct mmc_host { #endif #define MMC_CAP2_ALT_GPT_TEGRA (1 << 28) /* Host with eMMC that has GPT entry at a non-standard location */ + bool uhs2_sd_tran; /* UHS-II flag for SD_TRAN state */ + bool uhs2_app_cmd; /* UHS-II flag for APP command */ struct sd_uhs2_caps uhs2_caps; /* Host UHS-II capabilities */ int fixed_drv_type; /* fixed driver type for non-removable media */ @@ -714,6 +723,12 @@ static inline void mmc_debugfs_err_stats_inc(struct mmc_host *host, host->err_stats[stat] += 1; } +static inline int mmc_card_uhs2_hd_mode(struct mmc_host *host) +{ + return host->ios.timing == MMC_TIMING_UHS2_SPEED_A_HD || + host->ios.timing == MMC_TIMING_UHS2_SPEED_B_HD; +} + int mmc_sd_switch(struct mmc_card *card, bool mode, int group, u8 value, u8 *resp); int mmc_send_status(struct mmc_card *card, u32 *status); -- cgit v1.2.3 From a5a98a786e5e31e85a69d30935254e8730734706 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 14 Oct 2024 16:59:46 +0200 Subject: thermal: core: Add and use cooling device guard Add and use a special guard for cooling devices. This allows quite a few error code paths to be simplified among other things and brings in code size reduction for a good measure. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/5837621.DvuYhMxLoT@rjwysocki.net Reviewed-by: Lukasz Luba --- include/linux/thermal.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index bcaa92732e14..754802478b96 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -140,6 +140,9 @@ struct thermal_cooling_device { #endif }; +DEFINE_GUARD(cooling_dev, struct thermal_cooling_device *, mutex_lock(&_T->lock), + mutex_unlock(&_T->lock)) + /* Structure to define Thermal Zone parameters */ struct thermal_zone_params { const char *governor_name; -- cgit v1.2.3 From a6021aa24f6417416d93318bbfa022ab229c33c8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 11 Oct 2024 06:18:17 +0000 Subject: ACPI: EC: make EC support compile-time conditional The embedded controller code is mainly used on x86 laptops and cannot work without PC style I/O port access. Make this a user-visible configuration option that is default enabled on x86 but otherwise disabled, and that can never be enabled unless CONFIG_HAS_IOPORT is also available. The empty stubs in internal.h help ignore the EC code in configurations that don't support it. In order to see those stubs, the sbshc code also has to include this header and drop duplicate declarations. All the direct callers of ec_read/ec_write already had an x86 dependency and now also need to depend on APCI_EC. Signed-off-by: Arnd Bergmann Acked-by: Guenter Roeck Acked-by: Hans de Goede Link: https://patch.msgid.link/20241011061948.3211423-1-arnd@kernel.org [ rjw: Subject edits ] Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4d5ee84c468b..7dd24acd9ffe 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1164,8 +1164,6 @@ int acpi_subsys_suspend_noirq(struct device *dev); int acpi_subsys_suspend(struct device *dev); int acpi_subsys_freeze(struct device *dev); int acpi_subsys_poweroff(struct device *dev); -void acpi_ec_mark_gpe_for_wake(void); -void acpi_ec_set_gpe_wake_mask(u8 action); int acpi_subsys_restore_early(struct device *dev); #else static inline int acpi_subsys_prepare(struct device *dev) { return 0; } @@ -1176,6 +1174,12 @@ static inline int acpi_subsys_suspend(struct device *dev) { return 0; } static inline int acpi_subsys_freeze(struct device *dev) { return 0; } static inline int acpi_subsys_poweroff(struct device *dev) { return 0; } static inline int acpi_subsys_restore_early(struct device *dev) { return 0; } +#endif + +#if defined(CONFIG_ACPI_EC) && defined(CONFIG_PM_SLEEP) +void acpi_ec_mark_gpe_for_wake(void); +void acpi_ec_set_gpe_wake_mask(u8 action); +#else static inline void acpi_ec_mark_gpe_for_wake(void) {} static inline void acpi_ec_set_gpe_wake_mask(u8 action) {} #endif -- cgit v1.2.3 From 1cb80d9e93f861018fabe81a69ea0ded20f5a2d0 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 23 Oct 2024 16:47:48 -0700 Subject: bpf: Support __uptr type tag in BTF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces the "__uptr" type tag to BTF. It is to define a pointer pointing to the user space memory. This patch adds BTF logic to pass the "__uptr" type tag. btf_find_kptr() is reused for the "__uptr" tag. The "__uptr" will only be supported in the map_value of the task storage map. However, btf_parse_struct_meta() also uses btf_find_kptr() but it is not interested in "__uptr". This patch adds a "field_mask" argument to btf_find_kptr() which will return BTF_FIELD_IGNORE if the caller is not interested in a “__uptr” field. btf_parse_kptr() is also reused to parse the uptr. The btf_check_and_fixup_fields() is changed to do extra checks on the uptr to ensure that its struct size is not larger than PAGE_SIZE. It is not clear how a uptr pointing to a CO-RE supported kernel struct will be used, so it is also not allowed now. Signed-off-by: Kui-Feng Lee Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20241023234759.860539-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0c216e71cec7..bb31bc6d0c4d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -203,6 +203,7 @@ enum btf_field_type { BPF_GRAPH_ROOT = BPF_RB_ROOT | BPF_LIST_HEAD, BPF_REFCOUNT = (1 << 9), BPF_WORKQUEUE = (1 << 10), + BPF_UPTR = (1 << 11), }; typedef void (*btf_dtor_kfunc_t)(void *); @@ -322,6 +323,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) return "kptr"; case BPF_KPTR_PERCPU: return "percpu_kptr"; + case BPF_UPTR: + return "uptr"; case BPF_LIST_HEAD: return "bpf_list_head"; case BPF_LIST_NODE: @@ -350,6 +353,7 @@ static inline u32 btf_field_type_size(enum btf_field_type type) case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: return sizeof(u64); case BPF_LIST_HEAD: return sizeof(struct bpf_list_head); @@ -379,6 +383,7 @@ static inline u32 btf_field_type_align(enum btf_field_type type) case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: return __alignof__(u64); case BPF_LIST_HEAD: return __alignof__(struct bpf_list_head); -- cgit v1.2.3 From b9a5a07aeaa2a903fb1306eb422880b2fa5f937f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 23 Oct 2024 16:47:50 -0700 Subject: bpf: Add "bool swap_uptrs" arg to bpf_local_storage_update() and bpf_selem_alloc() In a later patch, the task local storage will only accept uptr from the syscall update_elem and will not accept uptr from the bpf prog. The reason is the bpf prog does not have a way to provide a valid user space address. bpf_local_storage_update() and bpf_selem_alloc() are used by both bpf prog bpf_task_storage_get(BPF_LOCAL_STORAGE_GET_F_CREATE) and bpf syscall update_elem. "bool swap_uptrs" arg is added to bpf_local_storage_update() and bpf_selem_alloc() to tell if it is called by the bpf prog or by the bpf syscall. When swap_uptrs==true, it is called by the syscall. The arg is named (swap_)uptrs because the later patch will swap the uptrs between the newly allocated selem and the user space provided map_value. It will make error handling easier in case map->ops->map_update_elem() fails and the caller can decide if it needs to unpin the uptr in the user space provided map_value or the bpf_local_storage_update() has already taken the uptr ownership and will take care of unpinning it also. Only swap_uptrs==false is passed now. The logic to handle the true case will be added in a later patch. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20241023234759.860539-4-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index dcddb0aef7d8..0c7216c065d5 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -181,7 +181,7 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, - bool charge_mem, gfp_t gfp_flags); + bool charge_mem, bool swap_uptrs, gfp_t gfp_flags); void bpf_selem_free(struct bpf_local_storage_elem *selem, struct bpf_local_storage_map *smap, @@ -195,7 +195,7 @@ bpf_local_storage_alloc(void *owner, struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, - void *value, u64 map_flags, gfp_t gfp_flags); + void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags); u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map); -- cgit v1.2.3 From 5bd5bab76669b1e1551f03f5fcbc165f3fa8d269 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 23 Oct 2024 16:47:51 -0700 Subject: bpf: Postpone bpf_selem_free() in bpf_selem_unlink_storage_nolock() In a later patch, bpf_selem_free() will call unpin_user_page() through bpf_obj_free_fields(). unpin_user_page() may take spin_lock. However, some bpf_selem_free() call paths have held a raw_spin_lock. Like this: raw_spin_lock_irqsave() bpf_selem_unlink_storage_nolock() bpf_selem_free() unpin_user_page() spin_lock() To avoid spinlock nested in raw_spinlock, bpf_selem_free() should be done after releasing the raw_spinlock. The "bool reuse_now" arg is replaced with "struct hlist_head *free_selem_list" in bpf_selem_unlink_storage_nolock(). The bpf_selem_unlink_storage_nolock() will append the to-be-free selem at the free_selem_list. The caller of bpf_selem_unlink_storage_nolock() will need to call the new bpf_selem_free_list(free_selem_list, reuse_now) to free the selem after releasing the raw_spinlock. Note that the selem->snode cannot be reused for linking to the free_selem_list because the selem->snode is protected by the raw_spinlock that we want to avoid holding. A new "struct hlist_node free_node;" is union-ized with the rcu_head. Only the first one successfully hlist_del_init_rcu(&selem->snode) will be able to use the free_node. After succeeding hlist_del_init_rcu(&selem->snode), the free_node and rcu_head usage is serialized such that they can share the 16 bytes in a union. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20241023234759.860539-5-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 0c7216c065d5..ab7244d8108f 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -77,7 +77,13 @@ struct bpf_local_storage_elem { struct hlist_node map_node; /* Linked to bpf_local_storage_map */ struct hlist_node snode; /* Linked to bpf_local_storage */ struct bpf_local_storage __rcu *local_storage; - struct rcu_head rcu; + union { + struct rcu_head rcu; + struct hlist_node free_node; /* used to postpone + * bpf_selem_free + * after raw_spin_unlock + */ + }; /* 8 bytes hole */ /* The data is stored in another cacheline to minimize * the number of cachelines access during a cache hit. -- cgit v1.2.3 From ba512b00e5efbf7e19cfb7fa9f66ce82669b7077 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 23 Oct 2024 16:47:53 -0700 Subject: bpf: Add uptr support in the map_value of the task local storage. This patch adds uptr support in the map_value of the task local storage. struct map_value { struct user_data __uptr *uptr; }; struct { __uint(type, BPF_MAP_TYPE_TASK_STORAGE); __uint(map_flags, BPF_F_NO_PREALLOC); __type(key, int); __type(value, struct value_type); } datamap SEC(".maps"); A new bpf_obj_pin_uptrs() is added to pin the user page and also stores the kernel address back to the uptr for the bpf prog to use later. It currently does not support the uptr pointing to a user struct across two pages. It also excludes PageHighMem support to keep it simple. As of now, the 32bit bpf jit is missing other more crucial bpf features. For example, many important bpf features depend on bpf kfunc now but so far only one arch (x86-32) supports it which was added by me as an example when kfunc was first introduced to bpf. The uptr can only be stored to the task local storage by the syscall update_elem. Meaning the uptr will not be considered if it is provided by the bpf prog through bpf_task_storage_get(BPF_LOCAL_STORAGE_GET_F_CREATE). This is enforced by only calling bpf_local_storage_update(swap_uptrs==true) in bpf_pid_task_storage_update_elem. Everywhere else will have swap_uptrs==false. This will pump down to bpf_selem_alloc(swap_uptrs==true). It is the only case that bpf_selem_alloc() will take the uptr value when updating the newly allocated selem. bpf_obj_swap_uptrs() is added to swap the uptr between the SDATA(selem)->data and the user provided map_value in "void *value". bpf_obj_swap_uptrs() makes the SDATA(selem)->data takes the ownership of the uptr and the user space provided map_value will have NULL in the uptr. The bpf_obj_unpin_uptrs() is called after map->ops->map_update_elem() returning error. If the map->ops->map_update_elem has reached a state that the local storage has taken the uptr ownership, the bpf_obj_unpin_uptrs() will be a no op because the uptr is NULL. A "__"bpf_obj_unpin_uptrs is added to make this error path unpin easier such that it does not have to check the map->record is NULL or not. BPF_F_LOCK is not supported when the map_value has uptr. This can be revisited later if there is a use case. A similar swap_uptrs idea can be considered. The final bit is to do unpin_user_page in the bpf_obj_free_fields(). The earlier patch has ensured that the bpf_obj_free_fields() has gone through the rcu gp when needed. Cc: linux-mm@kvack.org Cc: Shakeel Butt Signed-off-by: Martin KaFai Lau Acked-by: Shakeel Butt Link: https://lore.kernel.org/r/20241023234759.860539-7-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bb31bc6d0c4d..8888689aa917 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -424,6 +424,7 @@ static inline void bpf_obj_init_field(const struct btf_field *field, void *addr) case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: break; default: WARN_ON_ONCE(1); @@ -512,6 +513,25 @@ static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src bpf_obj_memcpy(map->record, dst, src, map->value_size, true); } +static inline void bpf_obj_swap_uptrs(const struct btf_record *rec, void *dst, void *src) +{ + unsigned long *src_uptr, *dst_uptr; + const struct btf_field *field; + int i; + + if (!btf_record_has_field(rec, BPF_UPTR)) + return; + + for (i = 0, field = rec->fields; i < rec->cnt; i++, field++) { + if (field->type != BPF_UPTR) + continue; + + src_uptr = src + field->offset; + dst_uptr = dst + field->offset; + swap(*src_uptr, *dst_uptr); + } +} + static inline void bpf_obj_memzero(struct btf_record *rec, void *dst, u32 size) { u32 curr_off = 0; -- cgit v1.2.3 From 02ac5f9d6caec96071103f7c62b5117526e47b64 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 24 Oct 2024 02:20:02 +0000 Subject: of: property: add of_graph_get_next_port() We have endpoint base functions - of_graph_get_next_endpoint() - of_graph_get_endpoint_count() - for_each_endpoint_of_node() Here, for_each_endpoint_of_node() loop finds each endpoints ports { port@0 { (1) endpoint {...}; }; port@1 { (2) endpoint {...}; }; ... }; In above case, it finds endpoint as (1) -> (2) -> ... Basically, user/driver knows which port is used for what, but not in all cases. For example on flexible/generic driver case, how many ports are used is not fixed. For example Sound Generic Card driver which is very flexible/generic and used from many venders can't know how many ports are used, and used for what, because it depends on each vender SoC and/or its used board. And more, the port can have multi endpoints. For example Generic Sound Card case, it supports many type of connection between CPU / Codec, and some of them uses multi endpoint in one port. see below. ports { (A) port@0 { (1) endpoint@0 {...}; (2) endpoint@1 {...}; }; (B) port@1 { (3) endpoint {...}; }; ... }; Generic Sound Card want to handle each connection via "port" base instead of "endpoint" base. But, it is very difficult to handle each "port" via existing for_each_endpoint_of_node(). Because getting each "port" via of_get_parent() from each "endpoint" doesn't work. For example in above case, both (1) (2) endpoint has same "port" (= A). Add "port" base functions. Signed-off-by: Kuninori Morimoto Link: https://lore.kernel.org/r/87ldyeb5t9.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Rob Herring (Arm) --- include/linux/of_graph.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of_graph.h b/include/linux/of_graph.h index a4bea62bfa29..44518f3583a4 100644 --- a/include/linux/of_graph.h +++ b/include/linux/of_graph.h @@ -11,6 +11,7 @@ #ifndef __LINUX_OF_GRAPH_H #define __LINUX_OF_GRAPH_H +#include #include #include @@ -37,14 +38,29 @@ struct of_endpoint { for (child = of_graph_get_next_endpoint(parent, NULL); child != NULL; \ child = of_graph_get_next_endpoint(parent, child)) +/** + * for_each_of_graph_port - iterate over every port in a device or ports node + * @parent: parent device or ports node containing port + * @child: loop variable pointing to the current port node + * + * When breaking out of the loop, and continue to use the @child, you need to + * use return_ptr(@child) or no_free_ptr(@child) not to call __free() for it. + */ +#define for_each_of_graph_port(parent, child) \ + for (struct device_node *child __free(device_node) = of_graph_get_next_port(parent, NULL);\ + child != NULL; child = of_graph_get_next_port(parent, child)) + #ifdef CONFIG_OF bool of_graph_is_present(const struct device_node *node); int of_graph_parse_endpoint(const struct device_node *node, struct of_endpoint *endpoint); unsigned int of_graph_get_endpoint_count(const struct device_node *np); +unsigned int of_graph_get_port_count(struct device_node *np); struct device_node *of_graph_get_port_by_id(struct device_node *node, u32 id); struct device_node *of_graph_get_next_endpoint(const struct device_node *parent, struct device_node *previous); +struct device_node *of_graph_get_next_port(const struct device_node *parent, + struct device_node *port); struct device_node *of_graph_get_endpoint_by_regs( const struct device_node *parent, int port_reg, int reg); struct device_node *of_graph_get_remote_endpoint( @@ -73,6 +89,11 @@ static inline unsigned int of_graph_get_endpoint_count(const struct device_node return 0; } +static inline unsigned int of_graph_get_port_count(struct device_node *np) +{ + return 0; +} + static inline struct device_node *of_graph_get_port_by_id( struct device_node *node, u32 id) { @@ -86,6 +107,13 @@ static inline struct device_node *of_graph_get_next_endpoint( return NULL; } +static inline struct device_node *of_graph_get_next_port( + const struct device_node *parent, + struct device_node *previous) +{ + return NULL; +} + static inline struct device_node *of_graph_get_endpoint_by_regs( const struct device_node *parent, int port_reg, int reg) { -- cgit v1.2.3 From 58fe47d6ac7413172e1fa88324fb1b4c0eb2c0a2 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 24 Oct 2024 02:20:06 +0000 Subject: of: property: add of_graph_get_next_port_endpoint() We already have of_graph_get_next_endpoint(), but it is not intuitive to use in some case. (X) node { (Y) ports { (P0) port@0 { endpoint { remote-endpoint = ...; };}; (P10) port@1 { endpoint { remote-endpoint = ...; }; (P11) endpoint { remote-endpoint = ...; };}; (P2) port@2 { endpoint { remote-endpoint = ...; };}; }; }; For example, if I want to handle port@1's 2 endpoints (= P10, P11), I want to use like below P10 = of_graph_get_next_endpoint(port1, NULL); P11 = of_graph_get_next_endpoint(port1, P10); But 1st one will be error, because of_graph_get_next_endpoint() requested 1st parameter is "node" (X) or "ports" (Y), not but "port". Below works well, but it will get P0 P0 = of_graph_get_next_endpoint(node, NULL); P0 = of_graph_get_next_endpoint(ports, NULL); In other words, we can't handle P10/P11 directly via of_graph_get_next_endpoint(). There is another non intuitive behavior on of_graph_get_next_endpoint(). In case of if I could get P10 pointer for some way, and if I want to handle port@1 things by loop, I would like use it like below /* * "ep" is now P10, and handle port1 things here, * but we don't know how many endpoints port1 have. * * Because "ep" is non NULL now, we can use port1 * as of_graph_get_next_endpoint(port1, xxx) */ do { /* do something for port1 specific things here */ } while (ep = of_graph_get_next_endpoint(port1, ep)) But it also not worked as I expected. I expect it will be P10 -> P11 -> NULL, but it will be P10 -> P11 -> P2, because of_graph_get_next_endpoint() will fetch "endpoint" beyond the "port". It is not useful for generic driver. To handle endpoint more intuitive, create of_graph_get_next_port_endpoint() of_graph_get_next_port_endpoint(port1, NULL); // P10 of_graph_get_next_port_endpoint(port1, P10); // P11 of_graph_get_next_port_endpoint(port1, P11); // NULL Signed-off-by: Kuninori Morimoto Link: https://lore.kernel.org/r/87jzdyb5t5.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Rob Herring (Arm) --- include/linux/of_graph.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of_graph.h b/include/linux/of_graph.h index 44518f3583a4..a692d9d979a6 100644 --- a/include/linux/of_graph.h +++ b/include/linux/of_graph.h @@ -50,6 +50,18 @@ struct of_endpoint { for (struct device_node *child __free(device_node) = of_graph_get_next_port(parent, NULL);\ child != NULL; child = of_graph_get_next_port(parent, child)) +/** + * for_each_of_graph_port_endpoint - iterate over every endpoint in a port node + * @parent: parent port node + * @child: loop variable pointing to the current endpoint node + * + * When breaking out of the loop, and continue to use the @child, you need to + * use return_ptr(@child) or no_free_ptr(@child) not to call __free() for it. + */ +#define for_each_of_graph_port_endpoint(parent, child) \ + for (struct device_node *child __free(device_node) = of_graph_get_next_port_endpoint(parent, NULL);\ + child != NULL; child = of_graph_get_next_port_endpoint(parent, child)) + #ifdef CONFIG_OF bool of_graph_is_present(const struct device_node *node); int of_graph_parse_endpoint(const struct device_node *node, @@ -61,6 +73,8 @@ struct device_node *of_graph_get_next_endpoint(const struct device_node *parent, struct device_node *previous); struct device_node *of_graph_get_next_port(const struct device_node *parent, struct device_node *port); +struct device_node *of_graph_get_next_port_endpoint(const struct device_node *port, + struct device_node *prev); struct device_node *of_graph_get_endpoint_by_regs( const struct device_node *parent, int port_reg, int reg); struct device_node *of_graph_get_remote_endpoint( @@ -114,6 +128,13 @@ static inline struct device_node *of_graph_get_next_port( return NULL; } +static inline struct device_node *of_graph_get_next_port_endpoint( + const struct device_node *parent, + struct device_node *previous) +{ + return NULL; +} + static inline struct device_node *of_graph_get_endpoint_by_regs( const struct device_node *parent, int port_reg, int reg) { -- cgit v1.2.3 From f730fd535fc51573f982fad629f2fc6b4a0cde2f Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 19 Aug 2024 09:41:15 +0200 Subject: cleanup: Remove address space of returned pointer Guard functions in local_lock.h are defined using DEFINE_GUARD() and DEFINE_LOCK_GUARD_1() macros having lock type defined as pointer in the percpu address space. The functions, defined by these macros return value in generic address space, causing: cleanup.h:157:18: error: return from pointer to non-enclosed address space and cleanup.h:214:18: error: return from pointer to non-enclosed address space when strict percpu checks are enabled. Add explicit casts to remove address space of the returned pointer. Found by GCC's named address space checks. Fixes: e4ab322fbaaa ("cleanup: Add conditional guard support") Signed-off-by: Uros Bizjak Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20240819074124.143565-1-ubizjak@gmail.com --- include/linux/cleanup.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 038b2d523bf8..518bd1fd86fb 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -290,7 +290,7 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ #define DEFINE_GUARD(_name, _type, _lock, _unlock) \ DEFINE_CLASS(_name, _type, if (_T) { _unlock; }, ({ _lock; _T; }), _type _T); \ static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \ - { return *_T; } + { return (void *)(__force unsigned long)*_T; } #define DEFINE_GUARD_COND(_name, _ext, _condlock) \ EXTEND_CLASS(_name, _ext, \ @@ -347,7 +347,7 @@ static inline void class_##_name##_destructor(class_##_name##_t *_T) \ \ static inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \ { \ - return _T->lock; \ + return (void *)(__force unsigned long)_T->lock; \ } -- cgit v1.2.3 From fcc22ac5baf06dd17193de44b60dbceea6461983 Mon Sep 17 00:00:00 2001 From: Przemek Kitszel Date: Fri, 18 Oct 2024 13:38:14 +0200 Subject: cleanup: Adjust scoped_guard() macros to avoid potential warning Change scoped_guard() and scoped_cond_guard() macros to make reasoning about them easier for static analysis tools (smatch, compiler diagnostics), especially to enable them to tell if the given usage of scoped_guard() is with a conditional lock class (interruptible-locks, try-locks) or not (like simple mutex_lock()). Add compile-time error if scoped_cond_guard() is used for non-conditional lock class. Beyond easier tooling and a little shrink reported by bloat-o-meter this patch enables developer to write code like: int foo(struct my_drv *adapter) { scoped_guard(spinlock, &adapter->some_spinlock) return adapter->spinlock_protected_var; } Current scoped_guard() implementation does not support that, due to compiler complaining: error: control reaches end of non-void function [-Werror=return-type] Technical stuff about the change: scoped_guard() macro uses common idiom of using "for" statement to declare a scoped variable. Unfortunately, current logic is too hard for compiler diagnostics to be sure that there is exactly one loop step; fix that. To make any loop so trivial that there is no above warning, it must not depend on any non-const variable to tell if there are more steps. There is no obvious solution for that in C, but one could use the compound statement expression with "goto" jumping past the "loop", effectively leaving only the subscope part of the loop semantics. More impl details: one more level of macro indirection is now needed to avoid duplicating label names; I didn't spot any other place that is using the "for (...; goto label) if (0) label: break;" idiom, so it's not packed for reuse beyond scoped_guard() family, what makes actual macros code cleaner. There was also a need to introduce const true/false variable per lock class, it is used to aid compiler diagnostics reasoning about "exactly 1 step" loops (note that converting that to function would undo the whole benefit). Big thanks to Andy Shevchenko for help on this patch, both internal and public, ranging from whitespace/formatting, through commit message clarifications, general improvements, ending with presenting alternative approaches - all despite not even liking the idea. Big thanks to Dmitry Torokhov for the idea of compile-time check for scoped_cond_guard() (to use it only with conditional locsk), and general improvements for the patch. Big thanks to David Lechner for idea to cover also scoped_cond_guard(). Signed-off-by: Przemek Kitszel Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Torokhov Link: https://lkml.kernel.org/r/20241018113823.171256-1-przemyslaw.kitszel@intel.com --- include/linux/cleanup.h | 52 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 518bd1fd86fb..0cc66f8d28e7 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -285,14 +285,20 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ * similar to scoped_guard(), except it does fail when the lock * acquire fails. * + * Only for conditional locks. */ +#define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond) \ +static __maybe_unused const bool class_##_name##_is_conditional = _is_cond + #define DEFINE_GUARD(_name, _type, _lock, _unlock) \ + __DEFINE_CLASS_IS_CONDITIONAL(_name, false); \ DEFINE_CLASS(_name, _type, if (_T) { _unlock; }, ({ _lock; _T; }), _type _T); \ static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \ { return (void *)(__force unsigned long)*_T; } #define DEFINE_GUARD_COND(_name, _ext, _condlock) \ + __DEFINE_CLASS_IS_CONDITIONAL(_name##_ext, true); \ EXTEND_CLASS(_name, _ext, \ ({ void *_t = _T; if (_T && !(_condlock)) _t = NULL; _t; }), \ class_##_name##_t _T) \ @@ -303,17 +309,40 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ CLASS(_name, __UNIQUE_ID(guard)) #define __guard_ptr(_name) class_##_name##_lock_ptr +#define __is_cond_ptr(_name) class_##_name##_is_conditional -#define scoped_guard(_name, args...) \ - for (CLASS(_name, scope)(args), \ - *done = NULL; __guard_ptr(_name)(&scope) && !done; done = (void *)1) - -#define scoped_cond_guard(_name, _fail, args...) \ - for (CLASS(_name, scope)(args), \ - *done = NULL; !done; done = (void *)1) \ - if (!__guard_ptr(_name)(&scope)) _fail; \ - else - +/* + * Helper macro for scoped_guard(). + * + * Note that the "!__is_cond_ptr(_name)" part of the condition ensures that + * compiler would be sure that for the unconditional locks the body of the + * loop (caller-provided code glued to the else clause) could not be skipped. + * It is needed because the other part - "__guard_ptr(_name)(&scope)" - is too + * hard to deduce (even if could be proven true for unconditional locks). + */ +#define __scoped_guard(_name, _label, args...) \ + for (CLASS(_name, scope)(args); \ + __guard_ptr(_name)(&scope) || !__is_cond_ptr(_name); \ + ({ goto _label; })) \ + if (0) { \ +_label: \ + break; \ + } else + +#define scoped_guard(_name, args...) \ + __scoped_guard(_name, __UNIQUE_ID(label), args) + +#define __scoped_cond_guard(_name, _fail, _label, args...) \ + for (CLASS(_name, scope)(args); true; ({ goto _label; })) \ + if (!__guard_ptr(_name)(&scope)) { \ + BUILD_BUG_ON(!__is_cond_ptr(_name)); \ + _fail; \ +_label: \ + break; \ + } else + +#define scoped_cond_guard(_name, _fail, args...) \ + __scoped_cond_guard(_name, _fail, __UNIQUE_ID(label), args) /* * Additional helper macros for generating lock guards with types, either for * locks that don't have a native type (eg. RCU, preempt) or those that need a @@ -369,14 +398,17 @@ static inline class_##_name##_t class_##_name##_constructor(void) \ } #define DEFINE_LOCK_GUARD_1(_name, _type, _lock, _unlock, ...) \ +__DEFINE_CLASS_IS_CONDITIONAL(_name, false); \ __DEFINE_UNLOCK_GUARD(_name, _type, _unlock, __VA_ARGS__) \ __DEFINE_LOCK_GUARD_1(_name, _type, _lock) #define DEFINE_LOCK_GUARD_0(_name, _lock, _unlock, ...) \ +__DEFINE_CLASS_IS_CONDITIONAL(_name, false); \ __DEFINE_UNLOCK_GUARD(_name, void, _unlock, __VA_ARGS__) \ __DEFINE_LOCK_GUARD_0(_name, _lock) #define DEFINE_LOCK_GUARD_1_COND(_name, _ext, _condlock) \ + __DEFINE_CLASS_IS_CONDITIONAL(_name##_ext, true); \ EXTEND_CLASS(_name, _ext, \ ({ class_##_name##_t _t = { .lock = l }, *_T = &_t;\ if (_T->lock && !(_condlock)) _T->lock = NULL; \ -- cgit v1.2.3 From 36c2cf88808d47e926d11b98734f154fe4a9f50f Mon Sep 17 00:00:00 2001 From: David Lechner Date: Tue, 1 Oct 2024 17:30:18 -0500 Subject: cleanup: Add conditional guard helper Add a new if_not_guard() macro to cleanup.h for handling conditional guards such as mutext_trylock(). This is more ergonomic than scoped_guard() for most use cases. Instead of hiding the error handling statement in the macro args, it works like a normal if statement and allow the error path to be indented while the normal code flow path is not indented. And it avoid unwanted side-effect from hidden for loop in scoped_guard(). Signed-off-by: David Lechner Co-developed-by: Fabio M. De Francesco Signed-off-by: Fabio M. De Francesco Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dan Williams Link: https://lkml.kernel.org/r/20241001-cleanup-if_not_cond_guard-v1-1-7753810b0f7a@baylibre.com --- include/linux/cleanup.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 0cc66f8d28e7..e859f79b9d2d 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -273,6 +273,12 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ * an anonymous instance of the (guard) class, not recommended for * conditional locks. * + * if_not_guard(name, args...) { }: + * convenience macro for conditional guards that calls the statement that + * follows only if the lock was not acquired (typically an error return). + * + * Only for conditional locks. + * * scoped_guard (name, args...) { }: * similar to CLASS(name, scope)(args), except the variable (with the * explicit name 'scope') is declard in a for-loop such that its scope is @@ -343,6 +349,15 @@ _label: \ #define scoped_cond_guard(_name, _fail, args...) \ __scoped_cond_guard(_name, _fail, __UNIQUE_ID(label), args) + +#define __if_not_guard(_name, _id, args...) \ + BUILD_BUG_ON(!__is_cond_ptr(_name)); \ + CLASS(_name, _id)(args); \ + if (!__guard_ptr(_name)(&_id)) + +#define if_not_guard(_name, args...) \ + __if_not_guard(_name, __UNIQUE_ID(guard), args) + /* * Additional helper macros for generating lock guards with types, either for * locks that don't have a native type (eg. RCU, preempt) or those that need a -- cgit v1.2.3 From ec24643bdd625971933451f22b8e33d364920f6e Mon Sep 17 00:00:00 2001 From: Vibhore Vardhan Date: Mon, 7 Oct 2024 08:08:56 +0200 Subject: firmware: ti_sci: Add system suspend and resume call Introduce system suspend call that enables the ti_sci driver to support low power mode when the user space issues a suspend to mem. The following power management operations defined in the TISCI Low Power Mode API [1] are implemented to support suspend and resume: 1) TISCI_MSG_PREPARE_SLEEP Prepare the SOC for entering into a low power mode and provide details to firmware about the state being entered. 2) TISCI_MSG_SET_IO_ISOLATION Control the IO isolation for Low Power Mode. Also, write a ti_sci_prepare_system_suspend call to be used in the driver suspend handler to allow the system to identify the low power mode being entered and if necessary, send TISCI_MSG_PREPARE_SLEEP with information about the mode being entered. Sysfw version >= 10.00.04 support LPM_DM_MANAGED capability [2], where Device Mgr firmware now manages which low power mode is chosen. Going forward, this is the default configuration supported for TI AM62 family of devices. The state chosen by the DM can be influenced by sending constraints using the new LPM constraint APIs. In case the firmware does not support LPM_DM_MANAGED mode, the mode selection logic can be extended as needed. If no suspend-to-RAM modes are supported, return without taking any action. We're using "pm_suspend_target_state" to map the kernel's target suspend state to SysFW low power mode. Make sure this is available only when CONFIG_SUSPEND is enabled. Suspend has to be split into two parts, ti_sci_suspend() will send the prepare sleep message to prepare suspend. ti_sci_suspend_noirq() sets IO isolation which needs to be done as late as possible to avoid any issues. On resume this has to be done as early as possible. [1] https://software-dl.ti.com/tisci/esd/latest/2_tisci_msgs/pm/lpm.html Co-developed-by: Dave Gerlach Signed-off-by: Dave Gerlach Signed-off-by: Georgi Vlaev Signed-off-by: Dhruva Gole Signed-off-by: Vibhore Vardhan Signed-off-by: Kevin Hilman Tested-by: Dhruva Gole Signed-off-by: Markus Schneider-Pargmann Tested-by: Roger Quadros Link: https://lore.kernel.org/r/20241007-tisci-syssuspendresume-v13-3-ed54cd659a49@baylibre.com Signed-off-by: Nishanth Menon --- include/linux/soc/ti/ti_sci_protocol.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h index bd0d11af76c5..1f1871e23f76 100644 --- a/include/linux/soc/ti/ti_sci_protocol.h +++ b/include/linux/soc/ti/ti_sci_protocol.h @@ -195,6 +195,10 @@ struct ti_sci_clk_ops { u64 *current_freq); }; +/* TISCI LPM IO isolation control values */ +#define TISCI_MSG_VALUE_IO_ENABLE 1 +#define TISCI_MSG_VALUE_IO_DISABLE 0 + /** * struct ti_sci_resource_desc - Description of TI SCI resource instance range. * @start: Start index of the first resource range. -- cgit v1.2.3 From 60357991f6b9d4bd4dc442a368da3f468cfa4903 Mon Sep 17 00:00:00 2001 From: Dave Gerlach Date: Mon, 7 Oct 2024 08:08:57 +0200 Subject: firmware: ti_sci: Introduce Power Management Ops Introduce power management ops supported by the TISCI Low Power Mode API [1]. 1) TISCI_MSG_LPM_WAKE_REASON Get which wake up source woke the SoC from Low Power Mode. The wake up source IDs will be common for all K3 platforms. 2) TISCI_MSG_LPM_SET_DEVICE_CONSTRAINT Set LPM constraint on behalf of a device. By setting a constraint, the device ensures that it will not be powered off or reset in the selected mode. 3) TISCI_MSG_LPM_SET_LATENCY_CONSTRAINT Set LPM resume latency constraint. By setting a constraint, the host ensures that the resume time from selected mode will be less than the constraint value. [1] https://software-dl.ti.com/tisci/esd/latest/2_tisci_msgs/pm/lpm.html Signed-off-by: Dave Gerlach [g-vlaev@ti.com: LPM_WAKE_REASON and IO_ISOLATION support] Signed-off-by: Georgi Vlaev [a-kaur@ti.com: SET_DEVICE_CONSTRAINT support] Signed-off-by: Akashdeep Kaur [vibhore@ti.com: SET_LATENCY_CONSTRAINT support] Signed-off-by: Vibhore Vardhan Signed-off-by: Kevin Hilman Reviewed-by: Akashdeep Kaur Tested-by: Dhruva Gole Signed-off-by: Markus Schneider-Pargmann Tested-by: Kevin Hilman Tested-by: Roger Quadros Acked-by: Dhruva Gole Link: https://lore.kernel.org/r/20241007-tisci-syssuspendresume-v13-4-ed54cd659a49@baylibre.com Signed-off-by: Nishanth Menon --- include/linux/soc/ti/ti_sci_protocol.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h index 1f1871e23f76..fd104b666836 100644 --- a/include/linux/soc/ti/ti_sci_protocol.h +++ b/include/linux/soc/ti/ti_sci_protocol.h @@ -199,6 +199,31 @@ struct ti_sci_clk_ops { #define TISCI_MSG_VALUE_IO_ENABLE 1 #define TISCI_MSG_VALUE_IO_DISABLE 0 +/* TISCI LPM constraint state values */ +#define TISCI_MSG_CONSTRAINT_SET 1 +#define TISCI_MSG_CONSTRAINT_CLR 0 + +/** + * struct ti_sci_pm_ops - Low Power Mode (LPM) control operations + * @lpm_wake_reason: Get the wake up source that woke the SoC from LPM + * - source: The wake up source that woke soc from LPM. + * - timestamp: Timestamp at which soc woke. + * @set_device_constraint: Set LPM constraint on behalf of a device + * - id: Device Identifier + * - state: The desired state of device constraint: set or clear. + * @set_latency_constraint: Set LPM resume latency constraint + * - latency: maximum acceptable latency to wake up from low power mode + * - state: The desired state of latency constraint: set or clear. + */ +struct ti_sci_pm_ops { + int (*lpm_wake_reason)(const struct ti_sci_handle *handle, + u32 *source, u64 *timestamp, u8 *pin, u8 *mode); + int (*set_device_constraint)(const struct ti_sci_handle *handle, + u32 id, u8 state); + int (*set_latency_constraint)(const struct ti_sci_handle *handle, + u16 latency, u8 state); +}; + /** * struct ti_sci_resource_desc - Description of TI SCI resource instance range. * @start: Start index of the first resource range. @@ -543,6 +568,7 @@ struct ti_sci_ops { struct ti_sci_core_ops core_ops; struct ti_sci_dev_ops dev_ops; struct ti_sci_clk_ops clk_ops; + struct ti_sci_pm_ops pm_ops; struct ti_sci_rm_core_ops rm_core_ops; struct ti_sci_rm_irq_ops rm_irq_ops; struct ti_sci_rm_ringacc_ops rm_ring_ops; -- cgit v1.2.3 From 037bc38b298c9a8de64f84b253c0b472228bbb10 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:03 -0700 Subject: KVM: Drop KVM_ERR_PTR_BAD_PAGE and instead return NULL to indicate an error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove KVM_ERR_PTR_BAD_PAGE and instead return NULL, as "bad page" is just a leftover bit of weirdness from days of old when KVM stuffed a "bad" page into the guest instead of actually handling missing pages. See commit cea7bb21280e ("KVM: MMU: Make gfn_to_page() always safe"). Reviewed-by: Alex Bennée Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-2-seanjc@google.com> --- include/linux/kvm_host.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 45be36e5285f..0b280e5bad0a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -153,13 +153,6 @@ static inline bool kvm_is_error_gpa(gpa_t gpa) return gpa == INVALID_GPA; } -#define KVM_ERR_PTR_BAD_PAGE (ERR_PTR(-ENOENT)) - -static inline bool is_error_page(struct page *page) -{ - return IS_ERR(page); -} - #define KVM_REQUEST_MASK GENMASK(7,0) #define KVM_REQUEST_NO_WAKEUP BIT(8) #define KVM_REQUEST_WAIT BIT(9) -- cgit v1.2.3 From 3af91068b7e10dba438f70eae94d877f20842fa1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:05 -0700 Subject: KVM: Add kvm_release_page_unused() API to put pages that KVM never consumes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an API to release an unused page, i.e. to put a page without marking it accessed or dirty. The API will be used when KVM faults-in a page but bails before installing the guest mapping (and other similar flows). Reviewed-by: Alex Bennée Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-4-seanjc@google.com> --- include/linux/kvm_host.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0b280e5bad0a..8de9acb0b35e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1216,6 +1216,15 @@ unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable); unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn); unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn, bool *writable); + +static inline void kvm_release_page_unused(struct page *page) +{ + if (!page) + return; + + put_page(page); +} + void kvm_release_page_clean(struct page *page); void kvm_release_page_dirty(struct page *page); -- cgit v1.2.3 From 6419bc52072b928acb764766968c672d5fede802 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:13 -0700 Subject: KVM: Rename gfn_to_page_many_atomic() to kvm_prefetch_pages() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename gfn_to_page_many_atomic() to kvm_prefetch_pages() to try and communicate its true purpose, as the "atomic" aspect is essentially a side effect of the fact that x86 uses the API while holding mmu_lock. E.g. even if mmu_lock weren't held, KVM wouldn't want to fault-in pages, as the goal is to opportunistically grab surrounding pages that have already been accessed and/or dirtied by the host, and to do so quickly. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-12-seanjc@google.com> --- include/linux/kvm_host.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8de9acb0b35e..6a3976c1a218 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1207,8 +1207,8 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm); void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); -int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, - struct page **pages, int nr_pages); +int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn, + struct page **pages, int nr_pages); struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); -- cgit v1.2.3 From e2d2ca71ac03c748dbc44e0dd7dc1557befb1ab6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:14 -0700 Subject: KVM: Drop @atomic param from gfn=>pfn and hva=>pfn APIs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop @atomic from the myriad "to_pfn" APIs now that all callers pass "false", and remove a comment blurb about KVM running only the "GUP fast" part in atomic context. No functional change intended. Reviewed-by: Alex Bennée Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-13-seanjc@google.com> --- include/linux/kvm_host.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6a3976c1a218..32e23e05a8c3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1232,9 +1232,8 @@ kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable); kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn); -kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn); kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, - bool atomic, bool interruptible, bool *async, + bool interruptible, bool *async, bool write_fault, bool *writable, hva_t *hva); void kvm_release_pfn_clean(kvm_pfn_t pfn); -- cgit v1.2.3 From 6769d1bcd3509ad2d2ee04da122c465a11a165b4 Mon Sep 17 00:00:00 2001 From: David Stevens Date: Thu, 10 Oct 2024 11:23:18 -0700 Subject: KVM: Replace "async" pointer in gfn=>pfn with "no_wait" and error code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a pfn error code to communicate that hva_to_pfn() failed because I/O was needed and disallowed, and convert @async to a constant @no_wait boolean. This will allow eliminating the @no_wait param by having callers pass in FOLL_NOWAIT along with other FOLL_* flags. Tested-by: Alex Bennée Signed-off-by: David Stevens Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-17-seanjc@google.com> --- include/linux/kvm_host.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 32e23e05a8c3..dc15a9a64408 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -97,6 +97,7 @@ #define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1) #define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 2) #define KVM_PFN_ERR_SIGPENDING (KVM_PFN_ERR_MASK + 3) +#define KVM_PFN_ERR_NEEDS_IO (KVM_PFN_ERR_MASK + 4) /* * error pfns indicate that the gfn is in slot but faild to @@ -1233,7 +1234,7 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable); kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn); kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, - bool interruptible, bool *async, + bool interruptible, bool no_wait, bool write_fault, bool *writable, hva_t *hva); void kvm_release_pfn_clean(kvm_pfn_t pfn); -- cgit v1.2.3 From cccefb0a0d3b4f7b41b1921538087dd7031876ac Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:20 -0700 Subject: KVM: Drop unused "hva" pointer from __gfn_to_pfn_memslot() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop @hva from __gfn_to_pfn_memslot() now that all callers pass NULL. No functional change intended. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-19-seanjc@google.com> --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index dc15a9a64408..2c9eb472f059 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1235,7 +1235,7 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn); kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, bool interruptible, bool no_wait, - bool write_fault, bool *writable, hva_t *hva); + bool write_fault, bool *writable); void kvm_release_pfn_clean(kvm_pfn_t pfn); void kvm_release_pfn_dirty(kvm_pfn_t pfn); -- cgit v1.2.3 From ef7db98e477f4b379fac3131bf94c33774c4a211 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:24 -0700 Subject: KVM: Use NULL for struct page pointer to indicate mremapped memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop yet another unnecessary magic page value from KVM, as there's zero reason to use a poisoned pointer to indicate "no page". If KVM uses a NULL page pointer, the kernel will explode just as quickly as if KVM uses a poisoned pointer. Never mind the fact that such usage would be a blatant and egregious KVM bug. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-23-seanjc@google.com> --- include/linux/kvm_host.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2c9eb472f059..cd6f5cc1930f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -273,16 +273,12 @@ enum { READING_SHADOW_PAGE_TABLES, }; -#define KVM_UNMAPPED_PAGE ((void *) 0x500 + POISON_POINTER_DELTA) - struct kvm_host_map { /* * Only valid if the 'pfn' is managed by the host kernel (i.e. There is * a 'struct page' for it. When using mem= kernel parameter some memory * can be used as guest memory but they are not managed by host * kernel). - * If 'pfn' is not managed by the host kernel, this field is - * initialized to KVM_UNMAPPED_PAGE. */ struct page *page; void *hva; -- cgit v1.2.3 From 2ff072ba7ad2deb1c3b2d231faa0ac3a25e2451b Mon Sep 17 00:00:00 2001 From: David Stevens Date: Thu, 10 Oct 2024 11:23:32 -0700 Subject: KVM: Migrate kvm_vcpu_map() to kvm_follow_pfn() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migrate kvm_vcpu_map() to kvm_follow_pfn(), and have it track whether or not the map holds a refcounted struct page. Precisely tracking struct page references will eventually allow removing kvm_pfn_to_refcounted_page() and its various wrappers. Signed-off-by: David Stevens [sean: use a pointer instead of a boolean] Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-31-seanjc@google.com> --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index cd6f5cc1930f..35e1beb017dd 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -280,6 +280,7 @@ struct kvm_host_map { * can be used as guest memory but they are not managed by host * kernel). */ + struct page *refcounted_page; struct page *page; void *hva; kvm_pfn_t pfn; @@ -1238,7 +1239,6 @@ void kvm_release_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_accessed(kvm_pfn_t pfn); -void kvm_release_pfn(kvm_pfn_t pfn, bool dirty); int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int len); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); -- cgit v1.2.3 From 2bcb52a3602bf4cbc55d8fb4da00c930f83d7789 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:33 -0700 Subject: KVM: Pin (as in FOLL_PIN) pages during kvm_vcpu_map() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pin, as in FOLL_PIN, pages when mapping them for direct access by KVM. As per Documentation/core-api/pin_user_pages.rst, writing to a page that was gotten via FOLL_GET is explicitly disallowed. Correct (uses FOLL_PIN calls): pin_user_pages() write to the data within the pages unpin_user_pages() INCORRECT (uses FOLL_GET calls): get_user_pages() write to the data within the pages put_page() Unfortunately, FOLL_PIN is a "private" flag, and so kvm_follow_pfn must use a one-off bool instead of being able to piggyback the "flags" field. Link: https://lwn.net/Articles/930667 Link: https://lore.kernel.org/all/cover.1683044162.git.lstoakes@gmail.com Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-32-seanjc@google.com> --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 35e1beb017dd..b4c541fa5a1f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -280,7 +280,7 @@ struct kvm_host_map { * can be used as guest memory but they are not managed by host * kernel). */ - struct page *refcounted_page; + struct page *pinned_page; struct page *page; void *hva; kvm_pfn_t pfn; -- cgit v1.2.3 From 365e319208442a0807a96e9ea4d0b1fa338f1929 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:35 -0700 Subject: KVM: Pass in write/dirty to kvm_vcpu_map(), not kvm_vcpu_unmap() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that all kvm_vcpu_{,un}map() users pass "true" for @dirty, have them pass "true" as a @writable param to kvm_vcpu_map(), and thus create a read-only mapping when possible. Note, creating read-only mappings can be theoretically slower, as they don't play nice with fast GUP due to the need to break CoW before mapping the underlying PFN. But practically speaking, creating a mapping isn't a super hot path, and getting a writable mapping for reading is weird and confusing. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-34-seanjc@google.com> --- include/linux/kvm_host.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b4c541fa5a1f..101dbf2be1ce 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -285,6 +285,7 @@ struct kvm_host_map { void *hva; kvm_pfn_t pfn; kvm_pfn_t gfn; + bool writable; }; /* @@ -1311,8 +1312,23 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn); struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn); -int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map); -void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty); + +int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map, + bool writable); +void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map); + +static inline int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, + struct kvm_host_map *map) +{ + return __kvm_vcpu_map(vcpu, gpa, map, true); +} + +static inline int kvm_vcpu_map_readonly(struct kvm_vcpu *vcpu, gpa_t gpa, + struct kvm_host_map *map) +{ + return __kvm_vcpu_map(vcpu, gpa, map, false); +} + unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable); int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, -- cgit v1.2.3 From 21dd877060d49f1f57901f929189653fc42ac37a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:44 -0700 Subject: KVM: Move declarations of memslot accessors up in kvm_host.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the memslot lookup helpers further up in kvm_host.h so that they can be used by inlined "to pfn" wrappers. No functional change intended. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-43-seanjc@google.com> --- include/linux/kvm_host.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 101dbf2be1ce..8d35a36e7707 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1168,6 +1168,10 @@ static inline bool kvm_memslot_iter_is_valid(struct kvm_memslot_iter *iter, gfn_ kvm_memslot_iter_is_valid(iter, end); \ kvm_memslot_iter_next(iter)) +struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); +struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); +struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn); + /* * KVM_SET_USER_MEMORY_REGION ioctl allows the following operations: * - create a new memory slot @@ -1303,16 +1307,12 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, }) int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); -struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn); void mark_page_dirty_in_slot(struct kvm *kvm, const struct kvm_memory_slot *memslot, gfn_t gfn); void mark_page_dirty(struct kvm *kvm, gfn_t gfn); -struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); -struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn); - int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map, bool writable); void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map); -- cgit v1.2.3 From 1c7b627e930624dd64ee906df554c8f2bad628ff Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:45 -0700 Subject: KVM: Add kvm_faultin_pfn() to specifically service guest page faults MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new dedicated API, kvm_faultin_pfn(), for servicing guest page faults, i.e. for getting pages/pfns that will be mapped into the guest via an mmu_notifier-protected KVM MMU. Keep struct kvm_follow_pfn buried in internal code, as having __kvm_faultin_pfn() take "out" params is actually cleaner for several architectures, e.g. it allows the caller to have its own "page fault" structure without having to marshal data to/from kvm_follow_pfn. Long term, common KVM would ideally provide a kvm_page_fault structure, a la x86's struct of the same name. But all architectures need to be converted to a common API before that can happen. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-44-seanjc@google.com> --- include/linux/kvm_host.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8d35a36e7707..a63b0325d3e2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1231,6 +1231,18 @@ static inline void kvm_release_page_unused(struct page *page) void kvm_release_page_clean(struct page *page); void kvm_release_page_dirty(struct page *page); +kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn, + unsigned int foll, bool *writable, + struct page **refcounted_page); + +static inline kvm_pfn_t kvm_faultin_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, + bool write, bool *writable, + struct page **refcounted_page) +{ + return __kvm_faultin_pfn(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, + write ? FOLL_WRITE : 0, writable, refcounted_page); +} + kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable); -- cgit v1.2.3 From 1fbee5b01a0fd27db571eed757682a7c20045107 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:48 -0700 Subject: KVM: guest_memfd: Provide "struct page" as output from kvm_gmem_get_pfn() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Provide the "struct page" associated with a guest_memfd pfn as an output from __kvm_gmem_get_pfn() so that KVM guest page fault handlers can directly put the page instead of having to rely on kvm_pfn_to_refcounted_page(). Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-47-seanjc@google.com> --- include/linux/kvm_host.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a63b0325d3e2..6efdc00b4254 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2487,11 +2487,13 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) #ifdef CONFIG_KVM_PRIVATE_MEM int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, - gfn_t gfn, kvm_pfn_t *pfn, int *max_order); + gfn_t gfn, kvm_pfn_t *pfn, struct page **page, + int *max_order); #else static inline int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, - kvm_pfn_t *pfn, int *max_order) + kvm_pfn_t *pfn, struct page **page, + int *max_order) { KVM_BUG_ON(1, kvm); return -EIO; -- cgit v1.2.3 From dc06193532af4ba88ed20daeef88f22b053ebb91 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:51 -0700 Subject: KVM: Move x86's API to release a faultin page to common KVM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move KVM x86's helper that "finishes" the faultin process to common KVM so that the logic can be shared across all architectures. Note, not all architectures implement a fast page fault path, but the gist of the comment applies to all architectures. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-50-seanjc@google.com> --- include/linux/kvm_host.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6efdc00b4254..3e06393e5f1e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1231,6 +1231,32 @@ static inline void kvm_release_page_unused(struct page *page) void kvm_release_page_clean(struct page *page); void kvm_release_page_dirty(struct page *page); +static inline void kvm_release_faultin_page(struct kvm *kvm, struct page *page, + bool unused, bool dirty) +{ + lockdep_assert_once(lockdep_is_held(&kvm->mmu_lock) || unused); + + if (!page) + return; + + /* + * If the page that KVM got from the *primary MMU* is writable, and KVM + * installed or reused a SPTE, mark the page/folio dirty. Note, this + * may mark a folio dirty even if KVM created a read-only SPTE, e.g. if + * the GFN is write-protected. Folios can't be safely marked dirty + * outside of mmu_lock as doing so could race with writeback on the + * folio. As a result, KVM can't mark folios dirty in the fast page + * fault handler, and so KVM must (somewhat) speculatively mark the + * folio dirty if KVM could locklessly make the SPTE writable. + */ + if (unused) + kvm_release_page_unused(page); + else if (dirty) + kvm_release_page_dirty(page); + else + kvm_release_page_clean(page); +} + kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn, unsigned int foll, bool *writable, struct page **refcounted_page); -- cgit v1.2.3 From f42e289a2095f61755e6ca5fd1370d441bf589d5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:24:18 -0700 Subject: KVM: Add support for read-only usage of gfn_to_page() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rework gfn_to_page() to support read-only accesses so that it can be used by arm64 to get MTE tags out of guest memory. Opportunistically rewrite the comment to be even more stern about using gfn_to_page(), as there are very few scenarios where requiring a struct page is actually the right thing to do (though there are such scenarios). Add a FIXME to call out that KVM probably should be pinning pages, not just getting pages. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-77-seanjc@google.com> --- include/linux/kvm_host.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3e06393e5f1e..96cf9e5660c3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1213,7 +1213,12 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn, struct page **pages, int nr_pages); -struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); +struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn, bool write); +static inline struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) +{ + return __gfn_to_page(kvm, gfn, true); +} + unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable); unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn); -- cgit v1.2.3 From 06cdaff80e50e3fb74e5e3101e1d5d7aa8b68da6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:24:21 -0700 Subject: KVM: Drop gfn_to_pfn() APIs now that all users are gone MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop gfn_to_pfn() and all its variants now that all users are gone. No functional change intended. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-80-seanjc@google.com> --- include/linux/kvm_host.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 96cf9e5660c3..4a1eaa40a215 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1274,14 +1274,6 @@ static inline kvm_pfn_t kvm_faultin_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, write ? FOLL_WRITE : 0, writable, refcounted_page); } -kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); -kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, - bool *writable); -kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn); -kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, - bool interruptible, bool no_wait, - bool write_fault, bool *writable); - void kvm_release_pfn_clean(kvm_pfn_t pfn); void kvm_release_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_dirty(kvm_pfn_t pfn); -- cgit v1.2.3 From 93b7da404f5b0b02a4211bbb784889f001d27953 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:24:26 -0700 Subject: KVM: Drop APIs that manipulate "struct page" via pfns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove all kvm_{release,set}_pfn_*() APIs now that all users are gone. No functional change intended. Reviewed-by: Alex Bennée Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-85-seanjc@google.com> --- include/linux/kvm_host.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4a1eaa40a215..d045f8310a48 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1274,11 +1274,6 @@ static inline kvm_pfn_t kvm_faultin_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, write ? FOLL_WRITE : 0, writable, refcounted_page); } -void kvm_release_pfn_clean(kvm_pfn_t pfn); -void kvm_release_pfn_dirty(kvm_pfn_t pfn); -void kvm_set_pfn_dirty(kvm_pfn_t pfn); -void kvm_set_pfn_accessed(kvm_pfn_t pfn); - int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int len); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); -- cgit v1.2.3 From 8b15c3764c05ed8766709711d2054d96349dee8e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:24:27 -0700 Subject: KVM: Don't grab reference on VM_MIXEDMAP pfns that have a "struct page" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that KVM no longer relies on an ugly heuristic to find its struct page references, i.e. now that KVM can't get false positives on VM_MIXEDMAP pfns, remove KVM's hack to elevate the refcount for pfns that happen to have a valid struct page. In addition to removing a long-standing wart in KVM, this allows KVM to map non-refcounted struct page memory into the guest, e.g. for exposing GPU TTM buffers to KVM guests. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-86-seanjc@google.com> --- include/linux/kvm_host.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d045f8310a48..02f0206fd2dc 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1730,9 +1730,6 @@ void kvm_arch_sync_events(struct kvm *kvm); int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); -struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn); -bool kvm_is_zone_device_page(struct page *page); - struct kvm_irq_ack_notifier { struct hlist_node link; unsigned gsi; -- cgit v1.2.3 From 6860d28ccb2390b4eeda32ab2ce7eb10f71921e1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 15 Oct 2024 12:08:39 +0200 Subject: timekeeping: Reorder struct timekeeper struct timekeeper is ordered suboptimal vs. cachelines. The layout, including the preceding seqcount (see struct tk_core in timekeeper.c) is: cacheline 0: seqcount, tkr_mono cacheline 1: tkr_raw, xtime_sec cacheline 2: ktime_sec ... tai_offset, internal variables cacheline 3: next_leap_ktime, raw_sec, internal variables cacheline 4: internal variables So any access to via ktime_get*() except for access to CLOCK_MONOTONIC_RAW will use either cachelines 0 + 1 or cachelines 0 + 2. Access to CLOCK_MONOTONIC_RAW uses cachelines 0 + 1 + 3. Reorder the members so that the result is more efficient: cacheline 0: seqcount, tkr_mono cacheline 1: xtime_sec, ktime_sec ... tai_offset cacheline 2: tkr_raw, raw_sec cacheline 3: internal variables cacheline 4: internal variables That means ktime_get*() will access cacheline 0 + 1 and CLOCK_MONOTONIC_RAW access will use cachelines 0 + 2. Update kernel-doc and fix formatting issues while at it. Also fix a typo in struct tk_read_base kernel-doc. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241015100839.12702-1-anna-maria@linutronix.de --- include/linux/timekeeper_internal.h | 106 ++++++++++++++++++++++-------------- 1 file changed, 65 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 902c20ef495a..a3b6380a7777 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -26,7 +26,7 @@ * occupies a single 64byte cache line. * * The struct is separate from struct timekeeper as it is also used - * for a fast NMI safe accessors. + * for the fast NMI safe accessors. * * @base_real is for the fast NMI safe accessor to allow reading clock * realtime from any context. @@ -44,33 +44,41 @@ struct tk_read_base { /** * struct timekeeper - Structure holding internal timekeeping values. - * @tkr_mono: The readout base structure for CLOCK_MONOTONIC - * @tkr_raw: The readout base structure for CLOCK_MONOTONIC_RAW - * @xtime_sec: Current CLOCK_REALTIME time in seconds - * @ktime_sec: Current CLOCK_MONOTONIC time in seconds - * @wall_to_monotonic: CLOCK_REALTIME to CLOCK_MONOTONIC offset - * @offs_real: Offset clock monotonic -> clock realtime - * @offs_boot: Offset clock monotonic -> clock boottime - * @offs_tai: Offset clock monotonic -> clock tai - * @tai_offset: The current UTC to TAI offset in seconds - * @clock_was_set_seq: The sequence number of clock was set events - * @cs_was_changed_seq: The sequence number of clocksource change events - * @next_leap_ktime: CLOCK_MONOTONIC time value of a pending leap-second - * @raw_sec: CLOCK_MONOTONIC_RAW time in seconds - * @monotonic_to_boot: CLOCK_MONOTONIC to CLOCK_BOOTTIME offset - * @cycle_interval: Number of clock cycles in one NTP interval - * @xtime_interval: Number of clock shifted nano seconds in one NTP - * interval. - * @xtime_remainder: Shifted nano seconds left over when rounding - * @cycle_interval - * @raw_interval: Shifted raw nano seconds accumulated per NTP interval. - * @ntp_error: Difference between accumulated time and NTP time in ntp - * shifted nano seconds. - * @ntp_error_shift: Shift conversion between clock shifted nano seconds and - * ntp shifted nano seconds. - * @last_warning: Warning ratelimiter (DEBUG_TIMEKEEPING) - * @underflow_seen: Underflow warning flag (DEBUG_TIMEKEEPING) - * @overflow_seen: Overflow warning flag (DEBUG_TIMEKEEPING) + * @tkr_mono: The readout base structure for CLOCK_MONOTONIC + * @xtime_sec: Current CLOCK_REALTIME time in seconds + * @ktime_sec: Current CLOCK_MONOTONIC time in seconds + * @wall_to_monotonic: CLOCK_REALTIME to CLOCK_MONOTONIC offset + * @offs_real: Offset clock monotonic -> clock realtime + * @offs_boot: Offset clock monotonic -> clock boottime + * @offs_tai: Offset clock monotonic -> clock tai + * @tai_offset: The current UTC to TAI offset in seconds + * @tkr_raw: The readout base structure for CLOCK_MONOTONIC_RAW + * @raw_sec: CLOCK_MONOTONIC_RAW time in seconds + * @clock_was_set_seq: The sequence number of clock was set events + * @cs_was_changed_seq: The sequence number of clocksource change events + * @monotonic_to_boot: CLOCK_MONOTONIC to CLOCK_BOOTTIME offset + * @cycle_interval: Number of clock cycles in one NTP interval + * @xtime_interval: Number of clock shifted nano seconds in one NTP + * interval. + * @xtime_remainder: Shifted nano seconds left over when rounding + * @cycle_interval + * @raw_interval: Shifted raw nano seconds accumulated per NTP interval. + * @next_leap_ktime: CLOCK_MONOTONIC time value of a pending leap-second + * @ntp_tick: The ntp_tick_length() value currently being + * used. This cached copy ensures we consistently + * apply the tick length for an entire tick, as + * ntp_tick_length may change mid-tick, and we don't + * want to apply that new value to the tick in + * progress. + * @ntp_error: Difference between accumulated time and NTP time in ntp + * shifted nano seconds. + * @ntp_error_shift: Shift conversion between clock shifted nano seconds and + * ntp shifted nano seconds. + * @ntp_err_mult: Multiplication factor for scaled math conversion + * @skip_second_overflow: Flag used to avoid updating NTP twice with same second + * @last_warning: Warning ratelimiter (DEBUG_TIMEKEEPING) + * @underflow_seen: Underflow warning flag (DEBUG_TIMEKEEPING) + * @overflow_seen: Overflow warning flag (DEBUG_TIMEKEEPING) * * Note: For timespec(64) based interfaces wall_to_monotonic is what * we need to add to xtime (or xtime corrected for sub jiffy times) @@ -88,10 +96,28 @@ struct tk_read_base { * * @monotonic_to_boottime is a timespec64 representation of @offs_boot to * accelerate the VDSO update for CLOCK_BOOTTIME. + * + * The cacheline ordering of the structure is optimized for in kernel usage of + * the ktime_get() and ktime_get_ts64() family of time accessors. Struct + * timekeeper is prepended in the core timekeeping code with a sequence count, + * which results in the following cacheline layout: + * + * 0: seqcount, tkr_mono + * 1: xtime_sec ... tai_offset + * 2: tkr_raw, raw_sec + * 3,4: Internal variables + * + * Cacheline 0,1 contain the data which is used for accessing + * CLOCK_MONOTONIC/REALTIME/BOOTTIME/TAI, while cacheline 2 contains the + * data for accessing CLOCK_MONOTONIC_RAW. Cacheline 3,4 are internal + * variables which are only accessed during timekeeper updates once per + * tick. */ struct timekeeper { + /* Cacheline 0 (together with prepended seqcount of timekeeper core): */ struct tk_read_base tkr_mono; - struct tk_read_base tkr_raw; + + /* Cacheline 1: */ u64 xtime_sec; unsigned long ktime_sec; struct timespec64 wall_to_monotonic; @@ -99,31 +125,29 @@ struct timekeeper { ktime_t offs_boot; ktime_t offs_tai; s32 tai_offset; + + /* Cacheline 2: */ + struct tk_read_base tkr_raw; + u64 raw_sec; + + /* Cachline 3 and 4 (timekeeping internal variables): */ unsigned int clock_was_set_seq; u8 cs_was_changed_seq; - ktime_t next_leap_ktime; - u64 raw_sec; + struct timespec64 monotonic_to_boot; - /* The following members are for timekeeping internal use */ u64 cycle_interval; u64 xtime_interval; s64 xtime_remainder; u64 raw_interval; - /* The ntp_tick_length() value currently being used. - * This cached copy ensures we consistently apply the tick - * length for an entire tick, as ntp_tick_length may change - * mid-tick, and we don't want to apply that new value to - * the tick in progress. - */ + + ktime_t next_leap_ktime; u64 ntp_tick; - /* Difference between accumulated time and NTP time in ntp - * shifted nano seconds. */ s64 ntp_error; u32 ntp_error_shift; u32 ntp_err_mult; - /* Flag used to avoid updating NTP twice with same second */ u32 skip_second_overflow; + #ifdef CONFIG_DEBUG_TIMEKEEPING long last_warning; /* -- cgit v1.2.3 From 92b043fd995a63a57aae29ff85a39b6f30cd440c Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Fri, 25 Oct 2024 13:01:41 +0200 Subject: time: Fix references to _msecs_to_jiffies() handling of values The details about the handling of the "normal" values were moved to the _msecs_to_jiffies() helpers in commit ca42aaf0c861 ("time: Refactor msecs_to_jiffies"). However, the same commit still mentioned __msecs_to_jiffies() in the added documentation. Thus point to _msecs_to_jiffies() instead. Fixes: ca42aaf0c861 ("time: Refactor msecs_to_jiffies") Signed-off-by: Miguel Ojeda Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241025110141.157205-2-ojeda@kernel.org --- include/linux/jiffies.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 1220f0fbe5bf..5d21dacd62bc 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -502,7 +502,7 @@ static inline unsigned long _msecs_to_jiffies(const unsigned int m) * - all other values are converted to jiffies by either multiplying * the input value by a factor or dividing it with a factor and * handling any 32-bit overflows. - * for the details see __msecs_to_jiffies() + * for the details see _msecs_to_jiffies() * * msecs_to_jiffies() checks for the passed in value being a constant * via __builtin_constant_p() allowing gcc to eliminate most of the -- cgit v1.2.3 From 8acdd0e7bfadda6b5103f2960d293581954454ed Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 25 Oct 2024 08:37:18 +0800 Subject: blk-mq: add non_owner variant of start_freeze/unfreeze queue APIs Add non_owner variant of start_freeze/unfreeze queue APIs, so that the caller knows that what they are doing, and we can skip lockdep support for non_owner variant in per-call level. Prepare for supporting lockdep for freezing/unfreezing queue. Reviewed-by: Christoph Hellwig Suggested-by: Christoph Hellwig Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241025003722.3630252-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 59e9adf815a4..2035fad3131f 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -919,6 +919,8 @@ void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); +void blk_mq_unfreeze_queue_non_owner(struct request_queue *q); +void blk_freeze_queue_start_non_owner(struct request_queue *q); void blk_mq_map_queues(struct blk_mq_queue_map *qmap); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); -- cgit v1.2.3 From f1be1788a32e8fa63416ad4518bbd1a85a825c9d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 25 Oct 2024 08:37:20 +0800 Subject: block: model freeze & enter queue as lock for supporting lockdep Recently we got several deadlock report[1][2][3] caused by blk_mq_freeze_queue and blk_enter_queue(). Turns out the two are just like acquiring read/write lock, so model them as read/write lock for supporting lockdep: 1) model q->q_usage_counter as two locks(io and queue lock) - queue lock covers sync with blk_enter_queue() - io lock covers sync with bio_enter_queue() 2) make the lockdep class/key as per-queue: - different subsystem has very different lock use pattern, shared lock class causes false positive easily - freeze_queue degrades to no lock in case that disk state becomes DEAD because bio_enter_queue() won't be blocked any more - freeze_queue degrades to no lock in case that request queue becomes dying because blk_enter_queue() won't be blocked any more 3) model blk_mq_freeze_queue() as acquire_exclusive & try_lock - it is exclusive lock, so dependency with blk_enter_queue() is covered - it is trylock because blk_mq_freeze_queue() are allowed to run concurrently 4) model blk_enter_queue() & bio_enter_queue() as acquire_read() - nested blk_enter_queue() are allowed - dependency with blk_mq_freeze_queue() is covered - blk_queue_exit() is often called from other contexts(such as irq), and it can't be annotated as lock_release(), so simply do it in blk_enter_queue(), this way still covered cases as many as possible With lockdep support, such kind of reports may be reported asap and needn't wait until the real deadlock is triggered. For example, lockdep report can be triggered in the report[3] with this patch applied. [1] occasional block layer hang when setting 'echo noop > /sys/block/sda/queue/scheduler' https://bugzilla.kernel.org/show_bug.cgi?id=219166 [2] del_gendisk() vs blk_queue_enter() race condition https://lore.kernel.org/linux-block/20241003085610.GK11458@google.com/ [3] queue_freeze & queue_enter deadlock in scsi https://lore.kernel.org/linux-block/ZxG38G9BuFdBpBHZ@fedora/T/#u Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241025003722.3630252-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 55bec14fe55f..d0a52ed05e60 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -25,6 +25,7 @@ #include #include #include +#include struct module; struct request_queue; @@ -474,6 +475,11 @@ struct request_queue { struct xarray hctx_table; struct percpu_ref q_usage_counter; + struct lock_class_key io_lock_cls_key; + struct lockdep_map io_lockdep_map; + + struct lock_class_key q_lock_cls_key; + struct lockdep_map q_lockdep_map; struct request *last_merge; -- cgit v1.2.3 From 30dac24e14b52e1787572d1d4e06eeabe8a63630 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Thu, 26 Sep 2024 16:01:21 +0200 Subject: fs/writeback: convert wbc_account_cgroup_owner to take a folio Most of the callers of wbc_account_cgroup_owner() are converting a folio to page before calling the function. wbc_account_cgroup_owner() is converting the page back to a folio to call mem_cgroup_css_from_folio(). Convert wbc_account_cgroup_owner() to take a folio instead of a page, and convert all callers to pass a folio directly except f2fs. Convert the page to folio for all the callers from f2fs as they were the only callers calling wbc_account_cgroup_owner() with a page. As f2fs is already in the process of converting to folios, these call sites might also soon be calling wbc_account_cgroup_owner() with a folio directly in the future. No functional changes. Only compile tested. Signed-off-by: Pankaj Raghav Link: https://lore.kernel.org/r/20240926140121.203821-1-kernel@pankajraghav.com Acked-by: David Sterba Acked-by: Tejun Heo Signed-off-by: Christian Brauner --- include/linux/writeback.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d6db822e4bb3..641a057e0413 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -217,7 +217,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, struct inode *inode) __releases(&inode->i_lock); void wbc_detach_inode(struct writeback_control *wbc); -void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, +void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio, size_t bytes); int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, enum wb_reason reason, struct wb_completion *done); @@ -324,7 +324,7 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) } static inline void wbc_account_cgroup_owner(struct writeback_control *wbc, - struct page *page, size_t bytes) + struct folio *folio, size_t bytes) { } -- cgit v1.2.3 From 0e152beb5aa1ccdac9aae9fa570a9e039aff7a03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Mon, 21 Oct 2024 13:37:17 -0300 Subject: libfs: Create the helper function generic_ci_validate_strict_name() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create a helper function for filesystems do the checks required for casefold directories and strict encoding. Suggested-by: Gabriel Krisman Bertazi Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: André Almeida Link: https://lore.kernel.org/r/20241021-tonyk-tmpfs-v8-1-f443d5814194@igalia.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 3559446279c1..403ee5d54c60 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -3456,6 +3457,50 @@ extern int generic_ci_match(const struct inode *parent, const struct qstr *folded_name, const u8 *de_name, u32 de_name_len); +#if IS_ENABLED(CONFIG_UNICODE) +/** + * generic_ci_validate_strict_name - Check if a given name is suitable + * for a directory + * + * This functions checks if the proposed filename is valid for the + * parent directory. That means that only valid UTF-8 filenames will be + * accepted for casefold directories from filesystems created with the + * strict encoding flag. That also means that any name will be + * accepted for directories that doesn't have casefold enabled, or + * aren't being strict with the encoding. + * + * @dir: inode of the directory where the new file will be created + * @name: name of the new file + * + * Return: + * * True if the filename is suitable for this directory. It can be + * true if a given name is not suitable for a strict encoding + * directory, but the directory being used isn't strict + * * False if the filename isn't suitable for this directory. This only + * happens when a directory is casefolded and the filesystem is strict + * about its encoding. + */ +static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name) +{ + if (!IS_CASEFOLDED(dir) || !sb_has_strict_encoding(dir->i_sb)) + return true; + + /* + * A casefold dir must have a encoding set, unless the filesystem + * is corrupted + */ + if (WARN_ON_ONCE(!dir->i_sb->s_encoding)) + return true; + + return !utf8_validate(dir->i_sb->s_encoding, name); +} +#else +static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name) +{ + return true; +} +#endif + static inline bool sb_has_encoding(const struct super_block *sb) { #if IS_ENABLED(CONFIG_UNICODE) -- cgit v1.2.3 From 04dad6c6d37d741bad9946a92171bfa637e989f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Mon, 21 Oct 2024 13:37:19 -0300 Subject: unicode: Export latest available UTF-8 version number MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Export latest available UTF-8 version number so filesystems can easily load the newest one. Signed-off-by: André Almeida Link: https://lore.kernel.org/r/20241021-tonyk-tmpfs-v8-3-f443d5814194@igalia.com Acked-by: Gabriel Krisman Bertazi Signed-off-by: Christian Brauner --- include/linux/unicode.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/unicode.h b/include/linux/unicode.h index 4d39e6e11a95..0c0ab04e84ee 100644 --- a/include/linux/unicode.h +++ b/include/linux/unicode.h @@ -16,6 +16,8 @@ struct utf8data_table; ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \ ((unsigned int)(REV))) +#define UTF8_LATEST UNICODE_AGE(12, 1, 0) + static inline u8 unicode_major(unsigned int age) { return (age >> UNICODE_MAJ_SHIFT) & 0xff; -- cgit v1.2.3 From 142fa60f61f93805471012f24e029af6d113c5cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Mon, 21 Oct 2024 13:37:20 -0300 Subject: unicode: Recreate utf8_parse_version() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All filesystems that currently support UTF-8 casefold can fetch the UTF-8 version from the filesystem metadata stored on disk. They can get the data stored and directly match it to a integer, so they can skip the string parsing step, which motivated the removal of this function in the first place. However, for tmpfs, the only way to tell the kernel which UTF-8 version we are about to use is via mount options, using a string. Re-introduce utf8_parse_version() to be used by tmpfs. This version differs from the original by skipping the intermediate step of copying the version string to an auxiliary string before calling match_token(). This versions calls match_token() in the argument string. The paramenters are simpler now as well. utf8_parse_version() was created by 9d53690f0d4 ("unicode: implement higher level API for string handling") and later removed by 49bd03cc7e9 ("unicode: pass a UNICODE_AGE() tripple to utf8_load"). Signed-off-by: André Almeida Link: https://lore.kernel.org/r/20241021-tonyk-tmpfs-v8-4-f443d5814194@igalia.com Reviewed-by: Theodore Ts'o Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Christian Brauner --- include/linux/unicode.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/unicode.h b/include/linux/unicode.h index 0c0ab04e84ee..5e6b212a2aed 100644 --- a/include/linux/unicode.h +++ b/include/linux/unicode.h @@ -78,4 +78,6 @@ int utf8_casefold_hash(const struct unicode_map *um, const void *salt, struct unicode_map *utf8_load(unsigned int version); void utf8_unload(struct unicode_map *um); +int utf8_parse_version(char *version); + #endif /* _LINUX_UNICODE_H */ -- cgit v1.2.3 From 458532c8dfeb24edd5e07467605a6484a728e5c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Mon, 21 Oct 2024 13:37:21 -0300 Subject: libfs: Export generic_ci_ dentry functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Export generic_ci_ dentry functions so they can be used by case-insensitive filesystems that need something more custom than the default one set by `struct generic_ci_dentry_ops`. Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: André Almeida Link: https://lore.kernel.org/r/20241021-tonyk-tmpfs-v8-5-f443d5814194@igalia.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 403ee5d54c60..b277369672a1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3458,6 +3458,10 @@ extern int generic_ci_match(const struct inode *parent, const u8 *de_name, u32 de_name_len); #if IS_ENABLED(CONFIG_UNICODE) +int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str); +int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name); + /** * generic_ci_validate_strict_name - Check if a given name is suitable * for a directory -- cgit v1.2.3 From 5cd9aecbc72c07e8b83e900078669a7d0bc5f98f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Mon, 21 Oct 2024 13:37:23 -0300 Subject: tmpfs: Add flag FS_CASEFOLD_FL support for tmpfs dirs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable setting flag FS_CASEFOLD_FL for tmpfs directories, when tmpfs is mounted with casefold support. A special check is need for this flag, since it can't be set for non-empty directories. Reviewed-by: Gabriel Krisman Bertazi Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: André Almeida Link: https://lore.kernel.org/r/20241021-tonyk-tmpfs-v8-7-f443d5814194@igalia.com Signed-off-by: Christian Brauner --- include/linux/shmem_fs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 515a9a6a3c6f..018da28c01e7 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -42,10 +42,10 @@ struct shmem_inode_info { struct inode vfs_inode; }; -#define SHMEM_FL_USER_VISIBLE FS_FL_USER_VISIBLE +#define SHMEM_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | FS_CASEFOLD_FL) #define SHMEM_FL_USER_MODIFIABLE \ - (FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL) -#define SHMEM_FL_INHERITED (FS_NODUMP_FL | FS_NOATIME_FL) + (FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL | FS_CASEFOLD_FL) +#define SHMEM_FL_INHERITED (FS_NODUMP_FL | FS_NOATIME_FL | FS_CASEFOLD_FL) struct shmem_quota_limits { qsize_t usrquota_bhardlimit; /* Default user quota block hard limit */ -- cgit v1.2.3 From a2ad1b8101a36c927bcbd1317475b9576c352155 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 25 Oct 2024 06:11:51 -0700 Subject: mm/gup: Add folio_add_pins() Export a function that adds pins to an already-pinned huge-page folio. This allows any range of small pages within the folio to be unpinned later. For example, pages pinned via memfd_pin_folios and modified by folio_add_pins could be unpinned via unpin_user_page(s). Link: https://patch.msgid.link/r/1729861919-234514-2-git-send-email-steven.sistare@oracle.com Suggested-by: Jason Gunthorpe Suggested-by: David Hildenbrand Signed-off-by: Steve Sistare Reviewed-by: Jason Gunthorpe Acked-by: David Hildenbrand Signed-off-by: Jason Gunthorpe --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ecf63d2b0582..f9de33ac2add 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2524,6 +2524,7 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, struct folio **folios, unsigned int max_folios, pgoff_t *offset); +int folio_add_pins(struct folio *folio, unsigned int pins); int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); -- cgit v1.2.3 From 66418687ac895717dc2f6ddffe24cf9b74cd0d3e Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 10 Oct 2024 10:24:42 -0500 Subject: kernel/range: Const-ify range_contains parameters range_contains() does not modify the range values. David suggested it is safer to keep those parameters as const.[1] Make range parameters const Link: https://lore.kernel.org/all/20241008161032.GB1609@twin.jikos.cz/ [1] Reviewed-by: Dan Williams Reviewed-by: David Sterba Link: https://patch.msgid.link/20241010-const-range-v1-1-afb6e4bfd8ce@intel.com Signed-off-by: Ira Weiny Signed-off-by: Dave Jiang --- include/linux/range.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/range.h b/include/linux/range.h index 6ad0b73cb7ad..7dc5e835e079 100644 --- a/include/linux/range.h +++ b/include/linux/range.h @@ -13,7 +13,8 @@ static inline u64 range_len(const struct range *range) return range->end - range->start + 1; } -static inline bool range_contains(struct range *r1, struct range *r2) +static inline bool range_contains(const struct range *r1, + const struct range *r2) { return r1->start <= r2->start && r1->end >= r2->end; } -- cgit v1.2.3 From 0bbff9ed81654d5f06bfca484681756ee407f924 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 2 Oct 2024 13:43:24 -0500 Subject: perf/arm_pmuv3: Add PMUv3.9 per counter EL0 access control Armv8.9/9.4 PMUv3.9 adds per counter EL0 access controls. Per counter access is enabled with the UEN bit in PMUSERENR_EL1 register. Individual counters are enabled/disabled in the PMUACR_EL1 register. When UEN is set, the CR/ER bits control EL0 write access and must be set to disable write access. With the access controls, the clearing of unused counters can be skipped. KVM also configures PMUSERENR_EL1 in order to trap to EL2. UEN does not need to be set for it since only PMUv3.5 is exposed to guests. Signed-off-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20241002184326.1105499-1-robh@kernel.org Signed-off-by: Will Deacon --- include/linux/perf/arm_pmuv3.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index 3372c1b56486..d698efba28a2 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -257,6 +257,7 @@ #define ARMV8_PMU_USERENR_SW (1 << 1) /* PMSWINC can be written at EL0 */ #define ARMV8_PMU_USERENR_CR (1 << 2) /* Cycle counter can be read at EL0 */ #define ARMV8_PMU_USERENR_ER (1 << 3) /* Event counter can be read at EL0 */ +#define ARMV8_PMU_USERENR_UEN (1 << 4) /* Fine grained per counter access at EL0 */ /* Mask for writable bits */ #define ARMV8_PMU_USERENR_MASK (ARMV8_PMU_USERENR_EN | ARMV8_PMU_USERENR_SW | \ ARMV8_PMU_USERENR_CR | ARMV8_PMU_USERENR_ER) -- cgit v1.2.3 From e1dce56443a4a18978fe39ee4af663e5b6b31422 Mon Sep 17 00:00:00 2001 From: Gowthami Thiagarajan Date: Mon, 28 Oct 2024 11:23:09 +0530 Subject: perf/marvell: Marvell PEM performance monitor support PCI Express Interface PMU includes various performance counters to monitor the data that is transmitted over the PCIe link. The counters track various inbound and outbound transactions which includes separate counters for posted/non-posted/completion TLPs. Also, inbound and outbound memory read requests along with their latencies can also be monitored. Address Translation Services(ATS)events such as ATS Translation, ATS Page Request, ATS Invalidation along with their corresponding latencies are also supported. The performance counters are 64 bits wide. For instance, perf stat -e ib_tlp_pr tracks the inbound posted TLPs for the workload. Co-developed-by: Linu Cherian Signed-off-by: Linu Cherian Signed-off-by: Gowthami Thiagarajan Link: https://lore.kernel.org/r/20241028055309.17893-1-gthiagarajan@marvell.com Signed-off-by: Will Deacon --- include/linux/cpuhotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 2361ed4d2b15..61d9a66d1807 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -227,6 +227,7 @@ enum cpuhp_state { CPUHP_AP_PERF_ARM_APM_XGENE_ONLINE, CPUHP_AP_PERF_ARM_CAVIUM_TX2_UNCORE_ONLINE, CPUHP_AP_PERF_ARM_MARVELL_CN10K_DDR_ONLINE, + CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE, CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE, CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE, CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE, -- cgit v1.2.3 From dc60de4eb0a431bde94e5abe55be6f646cdb435d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 24 Oct 2024 22:04:54 +0300 Subject: iio: acpi: Add iio_get_acpi_device_name_and_data() helper function A few drivers duplicate the code to retrieve ACPI device instance name. Some of them want an associated driver data as well. In order of deduplication introduce the common helper functions. Reviewed-by: Hans de Goede Signed-off-by: Andy Shevchenko Link: https://patch.msgid.link/20241024191200.229894-6-andriy.shevchenko@linux.intel.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 3a9b57187a95..445d6666a291 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -831,6 +831,7 @@ int iio_device_resume_triggering(struct iio_dev *indio_dev); bool iio_read_acpi_mount_matrix(struct device *dev, struct iio_mount_matrix *orientation, char *acpi_method); +const char *iio_get_acpi_device_name_and_data(struct device *dev, const void **data); #else static inline bool iio_read_acpi_mount_matrix(struct device *dev, struct iio_mount_matrix *orientation, @@ -838,7 +839,16 @@ static inline bool iio_read_acpi_mount_matrix(struct device *dev, { return false; } +static inline const char * +iio_get_acpi_device_name_and_data(struct device *dev, const void **data) +{ + return NULL; +} #endif +static inline const char *iio_get_acpi_device_name(struct device *dev) +{ + return iio_get_acpi_device_name_and_data(dev, NULL); +} /** * iio_get_current_scan_type - Get the current scan type for a channel -- cgit v1.2.3 From 4261974701851630951e9ab31f0de4ade0faea53 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Fri, 25 Oct 2024 19:46:55 -0500 Subject: printf: Add print format (%pra) for struct range The use of struct range in the CXL subsystem is growing. In particular, the addition of Dynamic Capacity devices uses struct range in a number of places which are reported in debug and error messages. To wit requiring the printing of the start/end fields in each print became cumbersome. Dan Williams mentions in [1] that it might be time to have a print specifier for struct range similar to struct resource. A few alternatives were considered including '%par', '%r', and '%pn'. %pra follows that struct range is similar to struct resource (%p[rR]) but needs to be different. Based on discussions with Petr and Andy '%pra' was chosen.[2] Andy also suggested to keep the range prints similar to struct resource though combined code. Add hex_range() to handle printing for both pointer types. Finally introduce DEFINE_RANGE() as a parallel to DEFINE_RES_*() and use it in the tests. Cc: Jonathan Corbet Cc: linux-doc@vger.kernel.org Cc: open list Link: https://lore.kernel.org/all/663922b475e50_d54d72945b@dwillia2-xfh.jf.intel.com.notmuch/ [1] Link: https://lore.kernel.org/all/66cea3bf3332f_f937b29424@iweiny-mobl.notmuch/ [2] Suggested-by: Dan Williams Signed-off-by: Ira Weiny Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/20241025-cxl-pra-v2-3-123a825daba2@intel.com Signed-off-by: Dave Jiang --- include/linux/range.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/range.h b/include/linux/range.h index 6ad0b73cb7ad..1358d4b1807a 100644 --- a/include/linux/range.h +++ b/include/linux/range.h @@ -31,4 +31,10 @@ int clean_sort_range(struct range *range, int az); void sort_range(struct range *range, int nr_range); +#define DEFINE_RANGE(_start, _end) \ +(struct range) { \ + .start = (_start), \ + .end = (_end), \ + } + #endif -- cgit v1.2.3 From 7c7e6c8924e7bf98db4e5b2edb202842003c00c2 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Thu, 24 Oct 2024 19:54:43 +0200 Subject: tty: serial: handle HAS_IOPORT dependencies In a future patch HAS_IOPORT=n will disable inb()/outb() and friends at compile time. We thus need to add HAS_IOPORT as dependency for those drivers using them unconditionally. Some 8250 serial drivers support MMIO only use, so fence only the parts requiring I/O ports and print an error message if a device can't be supported with the current configuration. Co-developed-by: Arnd Bergmann Signed-off-by: Arnd Bergmann Acked-by: Greg Kroah-Hartman Reviewed-by: Maciej W. Rozycki Signed-off-by: Niklas Schnelle Signed-off-by: Arnd Bergmann --- include/linux/serial_core.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 4ab65874a850..743b4afaad4c 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -505,7 +505,11 @@ struct uart_port { * The remaining bits are serial-core specific and not modifiable by * userspace. */ +#ifdef CONFIG_HAS_IOPORT #define UPF_FOURPORT ((__force upf_t) ASYNC_FOURPORT /* 1 */ ) +#else +#define UPF_FOURPORT 0 +#endif #define UPF_SAK ((__force upf_t) ASYNC_SAK /* 2 */ ) #define UPF_SPD_HI ((__force upf_t) ASYNC_SPD_HI /* 4 */ ) #define UPF_SPD_VHI ((__force upf_t) ASYNC_SPD_VHI /* 5 */ ) -- cgit v1.2.3 From f390525d27bc03dc5925293cbd9f22329648b248 Mon Sep 17 00:00:00 2001 From: Dingyan Li <18500469033@163.com> Date: Sun, 20 Oct 2024 15:47:21 +0800 Subject: usb: storage: fix wrong comments for struct bulk_cb_wrap In the flags, direction is in bit 7 instead of bit 0 based on the specification. Signed-off-by: Dingyan Li <18500469033@163.com> Link: https://lore.kernel.org/r/20241020074721.26905-1-18500469033@163.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/storage.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/storage.h b/include/linux/usb/storage.h index 2827ce72e502..8539956bc2be 100644 --- a/include/linux/usb/storage.h +++ b/include/linux/usb/storage.h @@ -53,7 +53,7 @@ struct bulk_cb_wrap { __le32 Signature; /* contains 'USBC' */ __u32 Tag; /* unique per command id */ __le32 DataTransferLength; /* size of data */ - __u8 Flags; /* direction in bit 0 */ + __u8 Flags; /* direction in bit 7 */ __u8 Lun; /* LUN normally 0 */ __u8 Length; /* length of the CDB */ __u8 CDB[16]; /* max command */ -- cgit v1.2.3 From 4fd06a5358e0d888d1bf23d274971ea7d1f45aad Mon Sep 17 00:00:00 2001 From: David Dai Date: Wed, 18 Sep 2024 17:08:33 -0700 Subject: cpufreq: add virtual-cpufreq driver Introduce a virtualized cpufreq driver for guest kernels to improve performance and power of workloads within VMs. This driver does two main things: 1. Sends the frequency of vCPUs as a hint to the host. The host uses the hint to schedule the vCPU threads and decide physical CPU frequency. 2. If a VM does not support a virtualized FIE(like AMUs), it queries the host CPU frequency by reading a MMIO region of a virtual cpufreq device to update the guest's frequency scaling factor periodically. This enables accurate Per-Entity Load Tracking for tasks running in the guest. Co-developed-by: Saravana Kannan Signed-off-by: Saravana Kannan Signed-off-by: David Dai Signed-off-by: Viresh Kumar --- include/linux/arch_topology.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index b721f360d759..d5d848849408 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -49,6 +49,7 @@ enum scale_freq_source { SCALE_FREQ_SOURCE_CPUFREQ = 0, SCALE_FREQ_SOURCE_ARCH, SCALE_FREQ_SOURCE_CPPC, + SCALE_FREQ_SOURCE_VIRT, }; struct scale_freq_data { -- cgit v1.2.3 From 5935b8377a0f3b2401e4e487336ed90fe6b9254d Mon Sep 17 00:00:00 2001 From: Sui Jingfeng Date: Wed, 23 Oct 2024 11:25:44 +0800 Subject: dma-mapping: remove an outdated comment from dma-map-ops.h The "/* CONFIG_ARCH_HAS_DMA_COHERENCE_H */" was an description about the ARCH_HAS_DMA_COHERENCE_H configure option, but it has been removed since the dma_default_coherent variable was lifted from the mips architecture to the driver core. Therefore it doesn't match any compile guard now. Just remove it. Fixes: 6d4e9a8efe3d ("driver core: lift dma_default_coherent into common code") Signed-off-by: Sui Jingfeng Signed-off-by: Christoph Hellwig --- include/linux/dma-map-ops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index b7773201414c..e172522cd936 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -242,7 +242,7 @@ static inline bool dev_is_dma_coherent(struct device *dev) { return true; } -#endif /* CONFIG_ARCH_HAS_DMA_COHERENCE_H */ +#endif static inline void dma_reset_need_sync(struct device *dev) { -- cgit v1.2.3 From be164349e173a8e71cd76f17c7ed720813b8d69b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 13 Oct 2024 07:19:48 +0200 Subject: dma-mapping: drop unneeded includes from dma-mapping.h Back in the day a lot of logic was implemented inline in dma-mapping.h and needed various includes. Move of this has long been moved out of line, so we can drop various includes to improve kernel rebuild times. Signed-off-by: Christoph Hellwig --- include/linux/dma-mapping.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 1524da363734..b79925b1c433 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -2,15 +2,11 @@ #ifndef _LINUX_DMA_MAPPING_H #define _LINUX_DMA_MAPPING_H -#include -#include -#include #include #include #include #include #include -#include /** * List of possible attributes associated with a DMA mapping. The semantics -- cgit v1.2.3 From 69e5a17511f654db0fe058d22719cef008c9faf1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 18 Oct 2024 14:01:19 -0300 Subject: iommu: Remove useless flush from iommu_create_device_direct_mappings() These days iommu_map() does not require external flushing, it always internally handles any required flushes. Since iommu_create_device_direct_mappings() only calls iommu_map(), remove the extra call. Since this is the last call site for iommu_flush_iotlb_all() remove it too. Signed-off-by: Jason Gunthorpe Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/0-v1-bb6c694e1b07+a29e1-iommu_no_flush_all_jgg@nvidia.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 62d1b85c80d3..03f9fa833db9 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -852,12 +852,6 @@ void iommu_set_dma_strict(void); extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev, unsigned long iova, int flags); -static inline void iommu_flush_iotlb_all(struct iommu_domain *domain) -{ - if (domain->ops->flush_iotlb_all) - domain->ops->flush_iotlb_all(domain); -} - static inline void iommu_iotlb_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *iotlb_gather) { @@ -1141,10 +1135,6 @@ static inline ssize_t iommu_map_sg(struct iommu_domain *domain, return -ENODEV; } -static inline void iommu_flush_iotlb_all(struct iommu_domain *domain) -{ -} - static inline void iommu_iotlb_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *iotlb_gather) { -- cgit v1.2.3 From f6440fcc9c7b7aaad2e52830ab83fa71990f76ff Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 9 Oct 2024 12:11:47 +0800 Subject: iommu: Remove iommu_domain_alloc() The iommu_domain_alloc() interface is no longer used in the tree anymore. Remove it to avoid dead code. There is increasing demand for supporting multiple IOMMU drivers, and this is the last bus-based thing standing in the way of that. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20241009041147.28391-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 03f9fa833db9..d93c87f0c003 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -787,7 +787,6 @@ static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather) extern int bus_iommu_probe(const struct bus_type *bus); extern bool device_iommu_capable(struct device *dev, enum iommu_cap cap); extern bool iommu_group_has_isolated_msi(struct iommu_group *group); -extern struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus); struct iommu_domain *iommu_paging_domain_alloc(struct device *dev); extern void iommu_domain_free(struct iommu_domain *domain); extern int iommu_attach_device(struct iommu_domain *domain, @@ -1079,11 +1078,6 @@ static inline bool device_iommu_capable(struct device *dev, enum iommu_cap cap) return false; } -static inline struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus) -{ - return NULL; -} - static inline struct iommu_domain *iommu_paging_domain_alloc(struct device *dev) { return ERR_PTR(-ENODEV); -- cgit v1.2.3 From 20858d4ebb423826b7a978d54cde195f51d87e20 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 28 Oct 2024 09:38:00 +0000 Subject: iommu: Introduce iommu_paging_domain_alloc_flags() Currently drivers calls iommu_paging_domain_alloc(dev) to get an UNMANAGED domain. This is not sufficient to support PASID with UNMANAGED domain as some HW like AMD requires certain page table type to support PASIDs. Also the domain_alloc_paging op only passes device as param for domain allocation. This is not sufficient for AMD driver to decide the right page table. Instead of extending ops->domain_alloc_paging() it was decided to enhance ops->domain_alloc_user() so that caller can pass various additional flags. Hence add iommu_paging_domain_alloc_flags() API which takes flags as parameter. Caller can pass additional parameter to indicate type of domain required, etc. iommu_paging_domain_alloc_flags() internally calls appropriate callback function to allocate a domain. Signed-off-by: Jason Gunthorpe [Added description - Vasant] Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Reviewed-by: Lu Baolu Reviewed-by: Yi Liu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20241028093810.5901-3-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index d93c87f0c003..aa78d911fdda 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -511,8 +511,6 @@ static inline int __iommu_copy_struct_from_user_array( * the caller iommu_domain_alloc() returns. * @domain_alloc_user: Allocate an iommu domain corresponding to the input * parameters as defined in include/uapi/linux/iommufd.h. - * Unlike @domain_alloc, it is called only by IOMMUFD and - * must fully initialize the new domain before return. * Upon success, if the @user_data is valid and the @parent * points to a kernel-managed domain, the new domain must be * IOMMU_DOMAIN_NESTED type; otherwise, the @parent must be @@ -787,7 +785,11 @@ static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather) extern int bus_iommu_probe(const struct bus_type *bus); extern bool device_iommu_capable(struct device *dev, enum iommu_cap cap); extern bool iommu_group_has_isolated_msi(struct iommu_group *group); -struct iommu_domain *iommu_paging_domain_alloc(struct device *dev); +struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev, unsigned int flags); +static inline struct iommu_domain *iommu_paging_domain_alloc(struct device *dev) +{ + return iommu_paging_domain_alloc_flags(dev, 0); +} extern void iommu_domain_free(struct iommu_domain *domain); extern int iommu_attach_device(struct iommu_domain *domain, struct device *dev); @@ -1078,6 +1080,12 @@ static inline bool device_iommu_capable(struct device *dev, enum iommu_cap cap) return false; } +struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev, + unsigned int flags) +{ + return ERR_PTR(-ENODEV); +} + static inline struct iommu_domain *iommu_paging_domain_alloc(struct device *dev) { return ERR_PTR(-ENODEV); -- cgit v1.2.3 From b6da940130579769a42605b2c7f529b6f14ef1f8 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 9 Oct 2024 16:29:37 +0200 Subject: mm, slab: add kerneldocs for common SLAB_ flags We have many SLAB_ flags but many are used only internally, by kunit tests or debugging subsystems cooperating with slab, or are set according to slab_debug boot parameter. Create kerneldocs for the commonly used flags that may be passed to kmem_cache_create(). SLAB_TYPESAFE_BY_RCU already had a detailed description, so turn it to a kerneldoc. Add some details for SLAB_ACCOUNT, SLAB_RECLAIM_ACCOUNT and SLAB_HWCACHE_ALIGN. Reference them from the __kmem_cache_create_args() kerneldoc. Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 60 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index b35e2db7eb0e..49e9fb93e864 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -77,7 +77,17 @@ enum _slab_flag_bits { #define SLAB_POISON __SLAB_FLAG_BIT(_SLAB_POISON) /* Indicate a kmalloc slab */ #define SLAB_KMALLOC __SLAB_FLAG_BIT(_SLAB_KMALLOC) -/* Align objs on cache lines */ +/** + * define SLAB_HWCACHE_ALIGN - Align objects on cache line boundaries. + * + * Sufficiently large objects are aligned on cache line boundary. For object + * size smaller than a half of cache line size, the alignment is on the half of + * cache line size. In general, if object size is smaller than 1/2^n of cache + * line size, the alignment is adjusted to 1/2^n. + * + * If explicit alignment is also requested by the respective + * &struct kmem_cache_args field, the greater of both is alignments is applied. + */ #define SLAB_HWCACHE_ALIGN __SLAB_FLAG_BIT(_SLAB_HWCACHE_ALIGN) /* Use GFP_DMA memory */ #define SLAB_CACHE_DMA __SLAB_FLAG_BIT(_SLAB_CACHE_DMA) @@ -87,8 +97,8 @@ enum _slab_flag_bits { #define SLAB_STORE_USER __SLAB_FLAG_BIT(_SLAB_STORE_USER) /* Panic if kmem_cache_create() fails */ #define SLAB_PANIC __SLAB_FLAG_BIT(_SLAB_PANIC) -/* - * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS! +/** + * define SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS! * * This delays freeing the SLAB page by a grace period, it does _NOT_ * delay object freeing. This means that if you do kmem_cache_free() @@ -99,20 +109,22 @@ enum _slab_flag_bits { * stays valid, the trick to using this is relying on an independent * object validation pass. Something like: * - * begin: - * rcu_read_lock(); - * obj = lockless_lookup(key); - * if (obj) { - * if (!try_get_ref(obj)) // might fail for free objects - * rcu_read_unlock(); - * goto begin; + * :: + * + * begin: + * rcu_read_lock(); + * obj = lockless_lookup(key); + * if (obj) { + * if (!try_get_ref(obj)) // might fail for free objects + * rcu_read_unlock(); + * goto begin; * - * if (obj->key != key) { // not the object we expected - * put_ref(obj); - * rcu_read_unlock(); - * goto begin; - * } - * } + * if (obj->key != key) { // not the object we expected + * put_ref(obj); + * rcu_read_unlock(); + * goto begin; + * } + * } * rcu_read_unlock(); * * This is useful if we need to approach a kernel structure obliquely, @@ -137,7 +149,6 @@ enum _slab_flag_bits { * * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU. */ -/* Defer freeing slabs to RCU */ #define SLAB_TYPESAFE_BY_RCU __SLAB_FLAG_BIT(_SLAB_TYPESAFE_BY_RCU) /* Trace allocations and frees */ #define SLAB_TRACE __SLAB_FLAG_BIT(_SLAB_TRACE) @@ -170,7 +181,12 @@ enum _slab_flag_bits { #else # define SLAB_FAILSLAB __SLAB_FLAG_UNUSED #endif -/* Account to memcg */ +/** + * define SLAB_ACCOUNT - Account allocations to memcg. + * + * All object allocations from this cache will be memcg accounted, regardless of + * __GFP_ACCOUNT being or not being passed to individual allocations. + */ #ifdef CONFIG_MEMCG # define SLAB_ACCOUNT __SLAB_FLAG_BIT(_SLAB_ACCOUNT) #else @@ -197,7 +213,13 @@ enum _slab_flag_bits { #endif /* The following flags affect the page allocator grouping pages by mobility */ -/* Objects are reclaimable */ +/** + * define SLAB_RECLAIM_ACCOUNT - Objects are reclaimable. + * + * Use this flag for caches that have an associated shrinker. As a result, slab + * pages are allocated with __GFP_RECLAIMABLE, which affects grouping pages by + * mobility, and are accounted in SReclaimable counter in /proc/meminfo + */ #ifndef CONFIG_SLUB_TINY #define SLAB_RECLAIM_ACCOUNT __SLAB_FLAG_BIT(_SLAB_RECLAIM_ACCOUNT) #else -- cgit v1.2.3 From 68f99be287a59d50a9ad231d523f7e578f8bd28a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Oct 2024 10:42:00 +0200 Subject: signal: Confine POSIX_TIMERS properly Move the itimer rearming out of the signal code and consolidate all posix timer related functions in the signal code under one ifdef. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241001083835.314100569@linutronix.de --- include/linux/posix-timers.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 453691710839..670bf03a56ef 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -100,6 +100,8 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct, { pct->bases[CPUCLOCK_SCHED].nextevt = runtime; } +void posixtimer_rearm_itimer(struct task_struct *p); +void posixtimer_rearm(struct kernel_siginfo *info); /* Init task static initializer */ #define INIT_CPU_TIMERBASE(b) { \ @@ -122,6 +124,8 @@ struct cpu_timer { }; static inline void posix_cputimers_init(struct posix_cputimers *pct) { } static inline void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) { } +static inline void posixtimer_rearm_itimer(struct task_struct *p) { } +static inline void posixtimer_rearm(struct kernel_siginfo *info) { } #endif #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK @@ -196,5 +200,4 @@ void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new); -void posixtimer_rearm(struct kernel_siginfo *info); #endif -- cgit v1.2.3 From 4febce44cfebcb490b196d5d10ae9f403ca4c956 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Oct 2024 10:42:03 +0200 Subject: posix-timers: Cure si_sys_private race The si_sys_private member of the siginfo which is embedded in the preallocated sigqueue is used by the posix timer code to decide whether a timer must be reprogrammed on signal delivery. The handling of this is racy as a long standing comment in that code documents. It is modified with the timer lock held, but without sighand lock being held. The actual signal delivery code checks for it under sighand lock without holding the timer lock. Hand the new value to send_sigqueue() as argument and store it with sighand lock held. This is an intermediate change to address this issue. The arguments to this function will be cleanup in subsequent changes. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241001083835.434338954@linutronix.de --- include/linux/sched/signal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index c8ed09ac29ac..bd9f569231d9 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -340,7 +340,7 @@ extern int send_sig(int, struct task_struct *, int); extern int zap_other_threads(struct task_struct *p); extern struct sigqueue *sigqueue_alloc(void); extern void sigqueue_free(struct sigqueue *); -extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type); +extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type, int si_private); extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); static inline void clear_notify_signal(void) -- cgit v1.2.3 From c775ea28d4e23f5e58b6953645ef90c1b27a8e83 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Oct 2024 10:42:04 +0200 Subject: signal: Allow POSIX timer signals to be dropped In case that a timer was reprogrammed or deleted an already pending signal is obsolete. Right now such signals are kept around and eventually delivered. While POSIX is blury about this: - "The effect of disarming or resetting a timer with pending expiration notifications is unspecified." - "The disposition of pending signals for the deleted timer is unspecified." it is reasonable in both cases to expect that pending signals are discarded as they have no meaning anymore. Prepare the signal code to allow dropping posix timer signals. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241001083835.494416923@linutronix.de --- include/linux/posix-timers.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 670bf03a56ef..4ab49e5c42af 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -100,8 +100,9 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct, { pct->bases[CPUCLOCK_SCHED].nextevt = runtime; } + void posixtimer_rearm_itimer(struct task_struct *p); -void posixtimer_rearm(struct kernel_siginfo *info); +bool posixtimer_deliver_signal(struct kernel_siginfo *info); /* Init task static initializer */ #define INIT_CPU_TIMERBASE(b) { \ @@ -125,7 +126,7 @@ static inline void posix_cputimers_init(struct posix_cputimers *pct) { } static inline void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) { } static inline void posixtimer_rearm_itimer(struct task_struct *p) { } -static inline void posixtimer_rearm(struct kernel_siginfo *info) { } +static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info) { return false; } #endif #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK -- cgit v1.2.3 From cd1e93aedab7f749760a33e9e094381973b1120e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Oct 2024 10:42:07 +0200 Subject: posix-timers: Rename k_itimer:: It_requeue_pending Prepare for using this struct member to do a proper reprogramming and deletion accounting so that stale signals can be dropped. No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241001083835.611997737@linutronix.de --- include/linux/posix-timers.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 4ab49e5c42af..253d106fac2c 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -150,8 +150,7 @@ static inline void posix_cputimers_init_work(void) { } * @it_active: Marker that timer is active * @it_overrun: The overrun counter for pending signals * @it_overrun_last: The overrun at the time of the last delivered signal - * @it_requeue_pending: Indicator that timer waits for being requeued on - * signal delivery + * @it_signal_seq: Sequence count to control signal delivery * @it_sigev_notify: The notify word of sigevent struct for signal delivery * @it_interval: The interval for periodic timers * @it_signal: Pointer to the creators signal struct @@ -172,7 +171,7 @@ struct k_itimer { int it_active; s64 it_overrun; s64 it_overrun_last; - int it_requeue_pending; + unsigned int it_signal_seq; int it_sigev_notify; ktime_t it_interval; struct signal_struct *it_signal; -- cgit v1.2.3 From 1550dde8a537b35dbf066c7f9cfe5f9b360bce0d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Oct 2024 10:42:09 +0200 Subject: posix-timers: Add proper state tracking Right now the state tracking is done by two struct members: - it_active: A boolean which tracks armed/disarmed state - it_signal_seq: A sequence counter which is used to invalidate settings and prevent rearming Replace it_active with it_status and keep properly track about the states in one place. This allows to reuse it_signal_seq to track reprogramming, disarm and delete operations in order to drop signals which are related to the state previous of those operations. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241001083835.670337048@linutronix.de --- include/linux/posix-timers.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 253d106fac2c..02afbb4da7f7 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -147,7 +147,7 @@ static inline void posix_cputimers_init_work(void) { } * @kclock: Pointer to the k_clock struct handling this timer * @it_clock: The posix timer clock id * @it_id: The posix timer id for identifying the timer - * @it_active: Marker that timer is active + * @it_status: The status of the timer * @it_overrun: The overrun counter for pending signals * @it_overrun_last: The overrun at the time of the last delivered signal * @it_signal_seq: Sequence count to control signal delivery @@ -168,7 +168,7 @@ struct k_itimer { const struct k_clock *kclock; clockid_t it_clock; timer_t it_id; - int it_active; + int it_status; s64 it_overrun; s64 it_overrun_last; unsigned int it_signal_seq; -- cgit v1.2.3 From 9cb7e40d388d6c0e4677809c6b2950bc67fd8830 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 21 Oct 2024 11:32:28 -0700 Subject: rtnetlink: Make per-netns RTNL dereference helpers to macro. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CONFIG_DEBUG_NET_SMALL_RTNL is off, rtnl_net_dereference() is the static inline wrapper of rtnl_dereference() returning a plain (void *) pointer to make sure net is always evaluated as requested in [0]. But, it makes sparse complain [1] when the pointer has __rcu annotation: net/ipv4/devinet.c:674:47: sparse: warning: incorrect type in argument 2 (different address spaces) net/ipv4/devinet.c:674:47: sparse: expected void *p net/ipv4/devinet.c:674:47: sparse: got struct in_ifaddr [noderef] __rcu * Also, if we evaluate net as (void *) in a macro, then the compiler in turn fails to build due to -Werror=unused-value. #define rtnl_net_dereference(net, p) \ ({ \ (void *)net; \ rtnl_dereference(p); \ }) net/ipv4/devinet.c: In function ‘inet_rtm_deladdr’: ./include/linux/rtnetlink.h:154:17: error: statement with no effect [-Werror=unused-value] 154 | (void *)net; \ net/ipv4/devinet.c:674:21: note: in expansion of macro ‘rtnl_net_dereference’ 674 | (ifa = rtnl_net_dereference(net, *ifap)) != NULL; | ^~~~~~~~~~~~~~~~~~~~ Let's go back to the original simplest macro. Note that checkpatch complains about this approach, but it's one-shot and less noisy than the other two. WARNING: Argument 'net' is not used in function-like macro #76: FILE: include/linux/rtnetlink.h:142: +#define rtnl_net_dereference(net, p) \ + rtnl_dereference(p) Fixes: 844e5e7e656d ("rtnetlink: Add assertion helpers for per-netns RTNL.") Link: https://lore.kernel.org/netdev/20241004132145.7fd208e9@kernel.org/ [0] Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202410200325.SaEJmyZS-lkp@intel.com/ [1] Signed-off-by: Kuniyuki Iwashima Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- include/linux/rtnetlink.h | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 8468a4ce8510..0e62918de63b 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -137,21 +137,12 @@ static inline void ASSERT_RTNL_NET(struct net *net) ASSERT_RTNL(); } -static inline void *rcu_dereference_rtnl_net(struct net *net, void *p) -{ - return rcu_dereference_rtnl(p); -} - -static inline void *rtnl_net_dereference(struct net *net, void *p) -{ - return rtnl_dereference(p); -} - -static inline void *rcu_replace_pointer_rtnl_net(struct net *net, - void *rp, void *p) -{ - return rcu_replace_pointer_rtnl(rp, p); -} +#define rcu_dereference_rtnl_net(net, p) \ + rcu_dereference_rtnl(p) +#define rtnl_net_dereference(net, p) \ + rtnl_dereference(p) +#define rcu_replace_pointer_rtnl_net(net, rp, p) \ + rcu_replace_pointer_rtnl(rp, p) #endif static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev) -- cgit v1.2.3 From d4b483208b2606add41a22bdd3c8cd6d36009319 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 21 Oct 2024 11:32:33 -0700 Subject: ipv4: Use per-netns RTNL helpers in inet_rtm_newaddr(). inet_rtm_to_ifa() and find_matching_ifa() are called under rtnl_net_lock(). __in_dev_get_rtnl() and in_dev_for_each_ifa_rtnl() there can use per-netns RTNL helpers. Let's define and use __in_dev_get_rtnl_net() and in_dev_for_each_ifa_rtnl_net(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/linux/inetdevice.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index d9c690c8c80b..5730ba6b1cfa 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -226,6 +226,10 @@ static __inline__ bool bad_mask(__be32 mask, __be32 addr) for (ifa = rtnl_dereference((in_dev)->ifa_list); ifa; \ ifa = rtnl_dereference(ifa->ifa_next)) +#define in_dev_for_each_ifa_rtnl_net(net, ifa, in_dev) \ + for (ifa = rtnl_net_dereference(net, (in_dev)->ifa_list); ifa; \ + ifa = rtnl_net_dereference(net, ifa->ifa_next)) + #define in_dev_for_each_ifa_rcu(ifa, in_dev) \ for (ifa = rcu_dereference((in_dev)->ifa_list); ifa; \ ifa = rcu_dereference(ifa->ifa_next)) @@ -252,6 +256,11 @@ static inline struct in_device *__in_dev_get_rtnl(const struct net_device *dev) return rtnl_dereference(dev->ip_ptr); } +static inline struct in_device *__in_dev_get_rtnl_net(const struct net_device *dev) +{ + return rtnl_net_dereference(dev_net(dev), dev->ip_ptr); +} + /* called with rcu_read_lock or rtnl held */ static inline bool ip_ignore_linkdown(const struct net_device *dev) { -- cgit v1.2.3 From d1c81818aa227b37d65b40f9883109c5256b9bfb Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 21 Oct 2024 11:32:36 -0700 Subject: rtnetlink: Define rtnl_net_trylock(). We will need the per-netns version of rtnl_trylock(). rtnl_net_trylock() calls __rtnl_net_lock() only when rtnl_trylock() successfully holds RTNL. When RTNL is removed, we will use mutex_trylock() for per-netns RTNL. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/linux/rtnetlink.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 0e62918de63b..14b88f551920 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -101,6 +101,7 @@ void __rtnl_net_lock(struct net *net); void __rtnl_net_unlock(struct net *net); void rtnl_net_lock(struct net *net); void rtnl_net_unlock(struct net *net); +int rtnl_net_trylock(struct net *net); int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b); bool rtnl_net_is_locked(struct net *net); @@ -132,6 +133,11 @@ static inline void rtnl_net_unlock(struct net *net) rtnl_unlock(); } +static inline int rtnl_net_trylock(struct net *net) +{ + return rtnl_trylock(); +} + static inline void ASSERT_RTNL_NET(struct net *net) { ASSERT_RTNL(); -- cgit v1.2.3 From d14772c0d88c387f881a577aa136e1e9b1291d07 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 29 Oct 2024 11:54:25 +0100 Subject: iommu: Fix prototype of iommu_paging_domain_alloc_flags() The iommu_paging_domain_alloc_flags() prototype for non-iommu kernel configurations lacks the 'static inline' prefixes. Cc: Jason Gunthorpe Cc: Vasant Hegde Fixes: 20858d4ebb42 ("iommu: Introduce iommu_paging_domain_alloc_flags()") Reviewed-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index aa78d911fdda..522efdc7d815 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1080,7 +1080,7 @@ static inline bool device_iommu_capable(struct device *dev, enum iommu_cap cap) return false; } -struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev, +static inline struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev, unsigned int flags) { return ERR_PTR(-ENODEV); -- cgit v1.2.3 From a33bf8d8ce7e06bf0f033865b0cea5887cd2ac8c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 29 Oct 2024 12:11:49 +0100 Subject: iommu: Restore iommu_flush_iotlb_all() This patch restores the iommu_flush_iotlb_all() function. Commit 69e5a17511f6 ("iommu: Remove useless flush from iommu_create_device_direct_mappings()") claims it removed the last call-site, except it did not. There is still at least one caller in drivers/gpu/drm/msm/msm_iommu.c so keep the function around until all call-sites are updated. Cc: Jason Gunthorpe Fixes: 69e5a17511f6 ("iommu: Remove useless flush from iommu_create_device_direct_mappings()") Acked-by: Will Deacon Reviewed-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 522efdc7d815..8cce372a33f1 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -853,6 +853,12 @@ void iommu_set_dma_strict(void); extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev, unsigned long iova, int flags); +static inline void iommu_flush_iotlb_all(struct iommu_domain *domain) +{ + if (domain->ops->flush_iotlb_all) + domain->ops->flush_iotlb_all(domain); +} + static inline void iommu_iotlb_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *iotlb_gather) { @@ -1137,6 +1143,10 @@ static inline ssize_t iommu_map_sg(struct iommu_domain *domain, return -ENODEV; } +static inline void iommu_flush_iotlb_all(struct iommu_domain *domain) +{ +} + static inline void iommu_iotlb_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *iotlb_gather) { -- cgit v1.2.3 From c382429b587ac49bd179d768f13e7fa5e7ed1787 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Sat, 26 Oct 2024 21:38:02 +0200 Subject: platform/x86: wmi: Replace dev_to_wdev() with to_wmi_device() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace dev_to_wdev() with to_wmi_device() to stop duplicating functionality. Also switch to_wmi_device() to use container_of_const() so const values are handled correctly. Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20241026193803.8802-2-W_Armin@gmx.de Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/wmi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/wmi.h b/include/linux/wmi.h index 120019677fc6..fca5fd43c707 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -34,7 +34,7 @@ struct wmi_device { * * Cast a struct device to a struct wmi_device. */ -#define to_wmi_device(device) container_of(device, struct wmi_device, dev) +#define to_wmi_device(device) container_of_const(device, struct wmi_device, dev) extern acpi_status wmidev_evaluate_method(struct wmi_device *wdev, u8 instance, u32 method_id, -- cgit v1.2.3 From e001341a984e709e377b275123aecb5a763eaef9 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Sat, 26 Oct 2024 21:38:03 +0200 Subject: platform/x86: wmi: Introduce to_wmi_driver() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce to_wmi_driver() as a replacement for dev_to_wdrv() so WMI drivers can use this support macro instead of having to duplicate its functionality. Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20241026193803.8802-3-W_Armin@gmx.de Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/wmi.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wmi.h b/include/linux/wmi.h index fca5fd43c707..10751c8e5e6a 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -73,6 +73,14 @@ struct wmi_driver { void (*notify)(struct wmi_device *device, union acpi_object *data); }; +/** + * to_wmi_driver() - Helper macro to cast a driver to a wmi_driver + * @drv: driver struct + * + * Cast a struct device_driver to a struct wmi_driver. + */ +#define to_wmi_driver(drv) container_of_const(drv, struct wmi_driver, driver) + extern int __must_check __wmi_driver_register(struct wmi_driver *driver, struct module *owner); extern void wmi_driver_unregister(struct wmi_driver *driver); -- cgit v1.2.3 From 2f5a65ef30a636d5030917eebd283ac447a212af Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Oct 2024 15:19:37 +0100 Subject: block: add a bdev_limits helper Add a helper to get the queue_limits from the bdev without having to poke into the request_queue. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Link: https://lore.kernel.org/r/20241029141937.249920-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d0a52ed05e60..7bfc877e159e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1159,6 +1159,11 @@ enum blk_default_limits { */ #define BLK_DEF_MAX_SECTORS_CAP 2560u +static inline struct queue_limits *bdev_limits(struct block_device *bdev) +{ + return &bdev_get_queue(bdev)->limits; +} + static inline unsigned long queue_segment_boundary(const struct request_queue *q) { return q->limits.seg_boundary_mask; @@ -1293,23 +1298,23 @@ unsigned int bdev_discard_alignment(struct block_device *bdev); static inline unsigned int bdev_max_discard_sectors(struct block_device *bdev) { - return bdev_get_queue(bdev)->limits.max_discard_sectors; + return bdev_limits(bdev)->max_discard_sectors; } static inline unsigned int bdev_discard_granularity(struct block_device *bdev) { - return bdev_get_queue(bdev)->limits.discard_granularity; + return bdev_limits(bdev)->discard_granularity; } static inline unsigned int bdev_max_secure_erase_sectors(struct block_device *bdev) { - return bdev_get_queue(bdev)->limits.max_secure_erase_sectors; + return bdev_limits(bdev)->max_secure_erase_sectors; } static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev) { - return bdev_get_queue(bdev)->limits.max_write_zeroes_sectors; + return bdev_limits(bdev)->max_write_zeroes_sectors; } static inline bool bdev_nonrot(struct block_device *bdev) @@ -1345,7 +1350,7 @@ static inline bool bdev_write_cache(struct block_device *bdev) static inline bool bdev_fua(struct block_device *bdev) { - return bdev_get_queue(bdev)->limits.features & BLK_FEAT_FUA; + return bdev_limits(bdev)->features & BLK_FEAT_FUA; } static inline bool bdev_nowait(struct block_device *bdev) -- cgit v1.2.3 From 085268829b07202cf7bf8ec1a8fb7fd9d8f6a41a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 30 Sep 2024 14:22:36 -0600 Subject: io_uring/poll: get rid of unlocked cancel hash io_uring maintains two hash lists of inflight requests: 1) ctx->cancel_table_locked. This is used when the caller has the ctx->uring_lock held already. This is only an issue side parameter, as removal or task_work will always have it held. 2) ctx->cancel_table. This is used when the issuer does NOT have the ctx->uring_lock held, and relies on the table spinlocks for access. However, it's pretty trivial to simply grab the lock in the one spot where we care about it, for insertion. With that, we can kill the unlocked table (and get rid of the _locked postfix for the other one). Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 4b9ba523978d..d8ca27da1341 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -291,7 +291,7 @@ struct io_ring_ctx { struct xarray io_bl_xa; - struct io_hash_table cancel_table_locked; + struct io_hash_table cancel_table; struct io_alloc_cache apoll_cache; struct io_alloc_cache netmsg_cache; struct io_alloc_cache rw_cache; @@ -342,7 +342,6 @@ struct io_ring_ctx { struct list_head io_buffers_comp; struct list_head cq_overflow_list; - struct io_hash_table cancel_table; struct hlist_head waitid_list; @@ -459,7 +458,6 @@ enum { REQ_F_DOUBLE_POLL_BIT, REQ_F_APOLL_MULTISHOT_BIT, REQ_F_CLEAR_POLLIN_BIT, - REQ_F_HASH_LOCKED_BIT, /* keep async read/write and isreg together and in order */ REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, @@ -534,8 +532,6 @@ enum { REQ_F_APOLL_MULTISHOT = IO_REQ_FLAG(REQ_F_APOLL_MULTISHOT_BIT), /* recvmsg special flag, clear EPOLLIN */ REQ_F_CLEAR_POLLIN = IO_REQ_FLAG(REQ_F_CLEAR_POLLIN_BIT), - /* hashed into ->cancel_hash_locked, protected by ->uring_lock */ - REQ_F_HASH_LOCKED = IO_REQ_FLAG(REQ_F_HASH_LOCKED_BIT), /* don't use lazy poll wake for this request */ REQ_F_POLL_NO_LAZY = IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT), /* file is pollable */ -- cgit v1.2.3 From ba4366f57b117c2eab996642288e5c75646ccfc9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 30 Sep 2024 14:29:06 -0600 Subject: io_uring/poll: get rid of per-hashtable bucket locks Any access to the table is protected by ctx->uring_lock now anyway, the per-bucket locking doesn't buy us anything. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index d8ca27da1341..9c7e1d3f06e5 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -67,7 +67,6 @@ struct io_file_table { }; struct io_hash_bucket { - spinlock_t lock; struct hlist_head list; } ____cacheline_aligned_in_smp; -- cgit v1.2.3 From 2946f08ae9ed650b94e0ffebcdfdda8de76bd926 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 18 Oct 2024 17:14:00 +0100 Subject: io_uring: clean up cqe trace points We have too many helpers posting CQEs, instead of tracing completion events before filling in a CQE and thus having to pass all the data, set the CQE first, pass it to the tracing helper and let it extract everything it needs. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b83c1ca9ee5aed2df0f3bb743bf5ed699cce4c86.1729267437.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 9c7e1d3f06e5..391087144666 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -662,4 +662,9 @@ struct io_overflow_cqe { struct io_uring_cqe cqe; }; +static inline bool io_ctx_cqe32(struct io_ring_ctx *ctx) +{ + return ctx->flags & IORING_SETUP_CQE32; +} + #endif -- cgit v1.2.3 From e6d43739d0ee49a39505d696ba6a656f47c2bd39 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 16 Oct 2024 15:54:06 -0600 Subject: io_uring: kill 'imu' from struct io_kiocb It's no longer being used, remove it. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 391087144666..6d3ee71bd832 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -613,9 +613,6 @@ struct io_kiocb { struct task_struct *task; union { - /* store used ubuf, so we can prevent reloading */ - struct io_mapped_ubuf *imu; - /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ struct io_buffer *kbuf; -- cgit v1.2.3 From 79cfe9e59c2a12c3b3faeeefe38d23f3d8030972 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 21 Oct 2024 13:34:10 -0600 Subject: io_uring/register: add IORING_REGISTER_RESIZE_RINGS Once a ring has been created, the size of the CQ and SQ rings are fixed. Usually this isn't a problem on the SQ ring side, as it merely controls the available number of requests that can be submitted in a single system call, and there's rarely a need to change that. For the CQ ring, it's a different story. For most efficient use of io_uring, it's important that the CQ ring never overflows. This means that applications must size it for the worst case scenario, which can be wasteful. Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize the existing rings. It takes a struct io_uring_params argument, the same one which is used to setup the ring initially, and resizes rings according to the sizes given. Certain properties are always inherited from the original ring setup, like SQE128/CQE32 and other setup options. The implementation only allows flag associated with how the CQ ring is sized and clamped. Existing unconsumed SQE and CQE entries are copied as part of the process. If either the SQ or CQ resized destination ring cannot hold the entries already present in the source rings, then the operation is failed with -EOVERFLOW. Any register op holds ->uring_lock, which prevents new submissions, and the internal mapping holds the completion lock as well across moving CQ ring state. To prevent races between mmap and ring resizing, add a mutex that's solely used to serialize ring resize and mmap. mmap_sem can't be used here, as as fork'ed process may be doing mmaps on the ring as well. The ctx->resize_lock is held across mmap operations, and the resize will grab it before swapping out the already mapped new data. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 6d3ee71bd832..841579dcdae9 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -415,6 +415,13 @@ struct io_ring_ctx { /* protected by ->completion_lock */ unsigned evfd_last_cq_tail; + /* + * Protection for resize vs mmap races - both the mmap and resize + * side will need to grab this lock, to prevent either side from + * being run concurrently with the other. + */ + struct mutex resize_lock; + /* * If IORING_SETUP_NO_MMAP is used, then the below holds * the gup'ed pages for the two rings, and the sqes. -- cgit v1.2.3 From aa00f67adc2c0d6439f81b5a81ff181377c47a7e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 22 Oct 2024 13:47:00 -0600 Subject: io_uring: add support for fixed wait regions Generally applications have 1 or a few waits of waiting, yet they pass in a struct io_uring_getevents_arg every time. This needs to get copied and, in turn, the timeout value needs to get copied. Rather than do this for every invocation, allow the application to register a fixed set of wait regions that can simply be indexed when asking the kernel to wait on events. At ring setup time, the application can register a number of these wait regions and initialize region/index 0 upfront: struct io_uring_reg_wait *reg; reg = io_uring_setup_reg_wait(ring, nr_regions, &ret); /* set timeout and mark as set, sigmask/sigmask_sz as needed */ reg->ts.tv_sec = 0; reg->ts.tv_nsec = 100000; reg->flags = IORING_REG_WAIT_TS; where nr_regions >= 1 && nr_regions <= PAGE_SIZE / sizeof(*reg). The above initializes index 0, but 63 other regions can be initialized, if needed. Now, instead of doing: struct __kernel_timespec timeout = { .tv_nsec = 100000, }; io_uring_submit_and_wait_timeout(ring, &cqe, nr, &t, NULL); to wait for events for each submit_and_wait, or just wait, operation, it can just reference the above region at offset 0 and do: io_uring_submit_and_wait_reg(ring, &cqe, nr, 0); to achieve the same goal of waiting 100usec without needing to copy both struct io_uring_getevents_arg (24b) and struct __kernel_timeout (16b) for each invocation. Struct io_uring_reg_wait looks as follows: struct io_uring_reg_wait { struct __kernel_timespec ts; __u32 min_wait_usec; __u32 flags; __u64 sigmask; __u32 sigmask_sz; __u32 pad[3]; __u64 pad2[2]; }; embedding the timeout itself in the region, rather than passing it as a pointer as well. Note that the signal mask is still passed as a pointer, both for compatability reasons, but also because there doesn't seem to be a lot of high frequency waits scenarios that involve setting and resetting the signal mask for each wait. The application is free to modify any region before a wait call, or it can use keep multiple regions with different settings to avoid needing to modify the same one for wait calls. Up to a page size of regions is mapped by default, allowing PAGE_SIZE / 64 available regions for use. The registered region must fit within a page. On a 4kb page size system, that allows for 64 wait regions if a full page is used, as the size of struct io_uring_reg_wait is 64b. The region registered must be aligned to io_uring_reg_wait in size. It's valid to register less than 64 entries. In network performance testing with zero-copy, this reduced the time spent waiting on the TX side from 3.12% to 0.3% and the RX side from 4.4% to 0.3%. Wait regions are fixed for the lifetime of the ring - once registered, they are persistent until the ring is torn down. The regions support minimum wait timeout as well as the regular waits. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 841579dcdae9..2f12828b22a4 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -327,6 +327,14 @@ struct io_ring_ctx { atomic_t cq_wait_nr; atomic_t cq_timeouts; struct wait_queue_head cq_wait; + + /* + * If registered with IORING_REGISTER_CQWAIT_REG, a single + * page holds N entries, mapped in cq_wait_arg. cq_wait_index + * is the maximum allowable index. + */ + struct io_uring_reg_wait *cq_wait_arg; + unsigned char cq_wait_index; } ____cacheline_aligned_in_smp; /* timeouts */ @@ -430,6 +438,8 @@ struct io_ring_ctx { unsigned short n_sqe_pages; struct page **ring_pages; struct page **sqe_pages; + + struct page **cq_wait_page; }; struct io_tw_state { -- cgit v1.2.3 From ff1256b8f3c45f222bce19fbfc1e1bc498b31d03 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 25 Oct 2024 08:54:28 -0600 Subject: io_uring/rsrc: move struct io_fixed_file to rsrc.h header There's no need for this internal structure to be visible, move it to the private rsrc.h header instead. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 2f12828b22a4..d4ba4ae480d6 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -55,11 +55,6 @@ struct io_wq_work { int cancel_seq; }; -struct io_fixed_file { - /* file * with additional FFS_* flags */ - unsigned long file_ptr; -}; - struct io_file_table { struct io_fixed_file *files; unsigned long *bitmap; -- cgit v1.2.3 From b2473a359763e27567993e7d8f37de82f57a0829 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Wed, 23 Oct 2024 18:14:26 +0100 Subject: of/fdt: add dt_phys arg to early_init_dt_scan and early_init_dt_verify __pa() is only intended to be used for linear map addresses and using it for initial_boot_params which is in fixmap for arm64 will give an incorrect value. Hence save the physical address when it is known at boot time when calling early_init_dt_scan for arm64 and use it at kexec time instead of converting the virtual address using __pa(). Note that arm64 doesn't need the FDT region reserved in the DT as the kernel explicitly reserves the passed in FDT. Therefore, only a debug warning is fixed with this change. Reported-by: Breno Leitao Suggested-by: Mark Rutland Signed-off-by: Usama Arif Fixes: ac10be5cdbfa ("arm64: Use common of_kexec_alloc_and_setup_fdt()") Link: https://lore.kernel.org/r/20241023171426.452688-1-usamaarif642@gmail.com Signed-off-by: Rob Herring (Arm) --- include/linux/of_fdt.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index d69ad5bb1eb1..b8d6c0c20876 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -31,6 +31,7 @@ extern void *of_fdt_unflatten_tree(const unsigned long *blob, extern int __initdata dt_root_addr_cells; extern int __initdata dt_root_size_cells; extern void *initial_boot_params; +extern phys_addr_t initial_boot_params_pa; extern char __dtb_start[]; extern char __dtb_end[]; @@ -70,8 +71,8 @@ extern u64 dt_mem_next_cell(int s, const __be32 **cellp); /* Early flat tree scan hooks */ extern int early_init_dt_scan_root(void); -extern bool early_init_dt_scan(void *params); -extern bool early_init_dt_verify(void *params); +extern bool early_init_dt_scan(void *dt_virt, phys_addr_t dt_phys); +extern bool early_init_dt_verify(void *dt_virt, phys_addr_t dt_phys); extern void early_init_dt_scan_nodes(void); extern const char *of_flat_dt_get_machine_name(void); -- cgit v1.2.3 From 90ee6ed776c06435a3fe79c7f5344761f52e1760 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 7 Oct 2024 16:23:59 +0200 Subject: fs: port files to file_ref Port files to rely on file_ref reference to improve scaling and gain overflow protection. - We continue to WARN during get_file() in case a file that is already marked dead is revived as get_file() is only valid if the caller already holds a reference to the file. This hasn't changed just the check changes. - The semantics for epoll and ttm's dmabuf usage have changed. Both epoll and ttm synchronize with __fput() to prevent the underlying file from beeing freed. (1) epoll Explaining epoll is straightforward using a simple diagram. Essentially, the mutex of the epoll instance needs to be taken in both __fput() and around epi_fget() preventing the file from being freed while it is polled or preventing the file from being resurrected. CPU1 CPU2 fput(file) -> __fput(file) -> eventpoll_release(file) -> eventpoll_release_file(file) mutex_lock(&ep->mtx) epi_item_poll() -> epi_fget() -> file_ref_get(file) mutex_unlock(&ep->mtx) mutex_lock(&ep->mtx); __ep_remove() mutex_unlock(&ep->mtx); -> kmem_cache_free(file) (2) ttm dmabuf This explanation is a bit more involved. A regular dmabuf file stashed the dmabuf in file->private_data and the file in dmabuf->file: file->private_data = dmabuf; dmabuf->file = file; The generic release method of a dmabuf file handles file specific things: f_op->release::dma_buf_file_release() while the generic dentry release method of a dmabuf handles dmabuf freeing including driver specific things: dentry->d_release::dma_buf_release() During ttm dmabuf initialization in ttm_object_device_init() the ttm driver copies the provided struct dma_buf_ops into a private location: struct ttm_object_device { spinlock_t object_lock; struct dma_buf_ops ops; void (*dmabuf_release)(struct dma_buf *dma_buf); struct idr idr; }; ttm_object_device_init(const struct dma_buf_ops *ops) { // copy original dma_buf_ops in private location tdev->ops = *ops; // stash the release method of the original struct dma_buf_ops tdev->dmabuf_release = tdev->ops.release; // override the release method in the copy of the struct dma_buf_ops // with ttm's own dmabuf release method tdev->ops.release = ttm_prime_dmabuf_release; } When a new dmabuf is created the struct dma_buf_ops with the overriden release method set to ttm_prime_dmabuf_release is passed in exp_info.ops: DEFINE_DMA_BUF_EXPORT_INFO(exp_info); exp_info.ops = &tdev->ops; exp_info.size = prime->size; exp_info.flags = flags; exp_info.priv = prime; The call to dma_buf_export() then sets mutex_lock_interruptible(&prime->mutex); dma_buf = dma_buf_export(&exp_info) { dmabuf->ops = exp_info->ops; } mutex_unlock(&prime->mutex); which creates a new dmabuf file and then install a file descriptor to it in the callers file descriptor table: ret = dma_buf_fd(dma_buf, flags); When that dmabuf file is closed we now get: fput(file) -> __fput(file) -> f_op->release::dma_buf_file_release() -> dput() -> d_op->d_release::dma_buf_release() -> dmabuf->ops->release::ttm_prime_dmabuf_release() mutex_lock(&prime->mutex); if (prime->dma_buf == dma_buf) prime->dma_buf = NULL; mutex_unlock(&prime->mutex); Where we can see that prime->dma_buf is set to NULL. So when we have the following diagram: CPU1 CPU2 fput(file) -> __fput(file) -> f_op->release::dma_buf_file_release() -> dput() -> d_op->d_release::dma_buf_release() -> dmabuf->ops->release::ttm_prime_dmabuf_release() ttm_prime_handle_to_fd() mutex_lock_interruptible(&prime->mutex) dma_buf = prime->dma_buf dma_buf && get_dma_buf_unless_doomed(dma_buf) -> file_ref_get(dma_buf->file) mutex_unlock(&prime->mutex); mutex_lock(&prime->mutex); if (prime->dma_buf == dma_buf) prime->dma_buf = NULL; mutex_unlock(&prime->mutex); -> kmem_cache_free(file) The logic of the mechanism is the same as for epoll: sync with __fput() preventing the file from being freed. Here the synchronization happens through the ttm instance's prime->mutex. Basically, the lifetime of the dma_buf and the file are tighly coupled. Both (1) and (2) used to call atomic_inc_not_zero() to check whether the file has already been marked dead and then refuse to revive it. This is only safe because both (1) and (2) sync with __fput() and thus prevent kmem_cache_free() on the file being called and thus prevent the file from being immediately recycled due to SLAB_TYPESAFE_BY_RCU. Both (1) and (2) have been ported from atomic_inc_not_zero() to file_ref_get(). That means a file that is already in the process of being marked as FILE_REF_DEAD: file_ref_put() cnt = atomic_long_dec_return() -> __file_ref_put(cnt) if (cnt == FIlE_REF_NOREF) atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD) can be revived again: CPU1 CPU2 file_ref_put() cnt = atomic_long_dec_return() -> __file_ref_put(cnt) if (cnt == FIlE_REF_NOREF) file_ref_get() // Brings reference back to FILE_REF_ONEREF atomic_long_add_negative() atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD) This is fine and inherent to the file_ref_get()/file_ref_put() semantics. For both (1) and (2) this is safe because __fput() is prevented from making progress if file_ref_get() fails due to the aforementioned synchronization mechanisms. Two cases need to be considered that affect both (1) epoll and (2) ttm dmabuf: (i) fput()'s file_ref_put() and marks the file as FILE_REF_NOREF but before that fput() can mark the file as FILE_REF_DEAD someone manages to sneak in a file_ref_get() and brings the refcount back from FILE_REF_NOREF to FILE_REF_ONEREF. In that case the original fput() doesn't call __fput(). For epoll the poll will finish and for ttm dmabuf the file can be used again. For ttm dambuf this is actually an advantage because it avoids immediately allocating a new dmabuf object. CPU1 CPU2 file_ref_put() cnt = atomic_long_dec_return() -> __file_ref_put(cnt) if (cnt == FIlE_REF_NOREF) file_ref_get() // Brings reference back to FILE_REF_ONEREF atomic_long_add_negative() atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD) (ii) fput()'s file_ref_put() marks the file FILE_REF_NOREF and also suceeds in actually marking it FILE_REF_DEAD and then calls into __fput() to free the file. When either (1) or (2) call file_ref_get() they fail as atomic_long_add_negative() will return true. At the same time, both (1) and (2) all file_ref_get() under mutexes that __fput() must also acquire preventing kmem_cache_free() from freeing the file. So while this might be treated as a change in semantics for (1) and (2) it really isn't. It if should end up causing issues this can be fixed by adding a helper that does something like: long cnt = atomic_long_read(&ref->refcnt); do { if (cnt < 0) return false; } while (!atomic_long_try_cmpxchg(&ref->refcnt, &cnt, cnt + 1)); return true; which would block FILE_REF_NOREF to FILE_REF_ONEREF transitions. - Jann correctly pointed out that kmem_cache_zalloc() cannot be used anymore once files have been ported to file_ref_t. The kmem_cache_zalloc() call will memset() the whole struct file to zero when it is reallocated. This will also set file->f_ref to zero which mens that a concurrent file_ref_get() can return true: CPU1 CPU2 __get_file_rcu() rcu_dereference_raw() close() [frees file] alloc_empty_file() kmem_cache_zalloc() [reallocates same file] memset(..., 0, ...) file_ref_get() [increments 0->1, returns true] init_file() file_ref_init(..., 1) [sets to 0] rcu_dereference_raw() fput() file_ref_put() [decrements 0->FILE_REF_NOREF, frees file] [UAF] causing a concurrent __get_file_rcu() call to acquire a reference to the file that is about to be reallocated and immediately freeing it on realizing that it has been recycled. This causes a UAF for the task that reallocated/recycled the file. This is prevented by switching from kmem_cache_zalloc() to kmem_cache_alloc() and initializing the fields manually. With file->f_ref initialized last. Note that a memset() also isn't guaranteed to atomically update an unsigned long so it's theoretically possible to see torn and therefore bogus counter values. Link: https://lore.kernel.org/r/20241007-brauner-file-rcuref-v2-3-387e24dc9163@kernel.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e3c603d01337..c13f648a1c13 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -1005,7 +1006,7 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) /** * struct file - Represents a file - * @f_count: reference count + * @f_ref: reference count * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context. * @f_mode: FMODE_* flags often used in hotpaths * @f_op: file operations @@ -1030,7 +1031,7 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.) */ struct file { - atomic_long_t f_count; + file_ref_t f_ref; spinlock_t f_lock; fmode_t f_mode; const struct file_operations *f_op; @@ -1078,15 +1079,14 @@ struct file_handle { static inline struct file *get_file(struct file *f) { - long prior = atomic_long_fetch_inc_relaxed(&f->f_count); - WARN_ONCE(!prior, "struct file::f_count incremented from zero; use-after-free condition present!\n"); + file_ref_inc(&f->f_ref); return f; } struct file *get_file_rcu(struct file __rcu **f); struct file *get_file_active(struct file **f); -#define file_count(x) atomic_long_read(&(x)->f_count) +#define file_count(f) file_ref_read(&(f)->f_ref) #define MAX_NON_LFS ((1UL<<31) - 1) -- cgit v1.2.3 From 95b6235e36953e9d6c822270961e79751eee2276 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 28 Oct 2024 17:58:38 +0000 Subject: iommu: Make bus_iommu_probe() static With the last external caller of bus_iommu_probe() now gone, make it internal as it really should be. Signed-off-by: Robin Murphy Tested-by: H. Nikolaus Schaller Reviewed-by: Kevin Hilman Tested-by: Kevin Hilman Tested-by: Beleswar Padhi Link: https://lore.kernel.org/r/a7511a034a27259aff4e14d80a861d3c40fbff1e.1730136799.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index bd722f473635..84a6ed5e803a 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -784,7 +784,6 @@ static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather) }; } -extern int bus_iommu_probe(const struct bus_type *bus); extern bool iommu_present(const struct bus_type *bus); extern bool device_iommu_capable(struct device *dev, enum iommu_cap cap); extern bool iommu_group_has_isolated_msi(struct iommu_group *group); -- cgit v1.2.3 From 133008e84b99e4f5f8cf3d8b768c995732df9406 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 16 Oct 2024 13:13:09 -0700 Subject: blk-integrity: remove seed for user mapped buffers The seed is only used for kernel generation and verification. That doesn't happen for user buffers, so passing the seed around doesn't accomplish anything. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Link: https://lore.kernel.org/r/20241016201309.1090320-1-kbusch@meta.com Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 4 ++-- include/linux/blk-integrity.h | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index dd831c269e99..dbf0f74c1529 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -72,7 +72,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp, unsigned int nr); int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset); -int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len, u32 seed); +int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len); void bio_integrity_unmap_user(struct bio *bio); bool bio_integrity_prep(struct bio *bio); void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); @@ -99,7 +99,7 @@ static inline void bioset_integrity_free(struct bio_set *bs) } static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf, - ssize_t len, u32 seed) + ssize_t len) { return -EINVAL; } diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index 676f8f860c47..c7eae0bfb013 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -28,7 +28,7 @@ static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t, int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, - ssize_t bytes, u32 seed); + ssize_t bytes); static inline bool blk_integrity_queue_supports_integrity(struct request_queue *q) @@ -104,8 +104,7 @@ static inline int blk_rq_map_integrity_sg(struct request *q, } static inline int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, - ssize_t bytes, - u32 seed) + ssize_t bytes) { return -EINVAL; } -- cgit v1.2.3 From 899f44531fe6cac4b024710fec647ecc127724b8 Mon Sep 17 00:00:00 2001 From: Sibi Sankar Date: Wed, 30 Oct 2024 18:25:10 +0530 Subject: pmdomain: core: Add GENPD_FLAG_DEV_NAME_FW flag Introduce GENPD_FLAG_DEV_NAME_FW flag which instructs genpd to generate an unique device name using ida. It is aimed to be used by genpd providers which derive their names directly from FW making them susceptible to debugfs node creation failures. Reported-by: Johan Hovold Closes: https://lore.kernel.org/lkml/ZoQjAWse2YxwyRJv@hovoldconsulting.com/ Fixes: 718072ceb211 ("PM: domains: create debugfs nodes when adding power domains") Suggested-by: Ulf Hansson Suggested-by: Dmitry Baryshkov Signed-off-by: Sibi Sankar Cc: stable@vger.kernel.org Message-ID: <20241030125512.2884761-5-quic_sibis@quicinc.com> Signed-off-by: Ulf Hansson --- include/linux/pm_domain.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index b637ec14025f..cf4b11be3709 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -92,6 +92,10 @@ struct dev_pm_domain_list { * GENPD_FLAG_OPP_TABLE_FW: The genpd provider supports performance states, * but its corresponding OPP tables are not * described in DT, but are given directly by FW. + * + * GENPD_FLAG_DEV_NAME_FW: Instructs genpd to generate an unique device name + * using ida. It is used by genpd providers which + * get their genpd-names directly from FW. */ #define GENPD_FLAG_PM_CLK (1U << 0) #define GENPD_FLAG_IRQ_SAFE (1U << 1) @@ -101,6 +105,7 @@ struct dev_pm_domain_list { #define GENPD_FLAG_RPM_ALWAYS_ON (1U << 5) #define GENPD_FLAG_MIN_RESIDENCY (1U << 6) #define GENPD_FLAG_OPP_TABLE_FW (1U << 7) +#define GENPD_FLAG_DEV_NAME_FW (1U << 8) enum gpd_status { GENPD_STATE_ON = 0, /* PM domain is on */ @@ -163,6 +168,7 @@ struct generic_pm_domain { atomic_t sd_count; /* Number of subdomains with power "on" */ enum gpd_status status; /* Current state of the domain */ unsigned int device_count; /* Number of devices */ + unsigned int device_id; /* unique device id */ unsigned int suspended_count; /* System suspend device counter */ unsigned int prepared_count; /* Suspend counter of prepared devices */ unsigned int performance_state; /* Aggregated max performance state */ -- cgit v1.2.3 From b35108a51cf7bab58d7eace1267d7965978bcdb8 Mon Sep 17 00:00:00 2001 From: Easwar Hariharan Date: Wed, 30 Oct 2024 17:47:35 +0000 Subject: jiffies: Define secs_to_jiffies() secs_to_jiffies() is defined in hci_event.c and cannot be reused by other call sites. Hoist it into the core code to allow conversion of the ~1150 usages of msecs_to_jiffies() that either: - use a multiplier value of 1000 or equivalently MSEC_PER_SEC, or - have timeouts that are denominated in seconds (i.e. end in 000) It's implemented as a macro to allow usage in static initializers. This will also allow conversion of yet more sites that use (sec * HZ) directly, and improve their readability. Suggested-by: Michael Kelley Signed-off-by: Easwar Hariharan Signed-off-by: Thomas Gleixner Reviewed-by: Luiz Augusto von Dentz Link: https://lore.kernel.org/all/20241030-open-coded-timeouts-v3-1-9ba123facf88@linux.microsoft.com --- include/linux/jiffies.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 5d21dacd62bc..ed945f42e064 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -526,6 +526,19 @@ static __always_inline unsigned long msecs_to_jiffies(const unsigned int m) } } +/** + * secs_to_jiffies: - convert seconds to jiffies + * @_secs: time in seconds + * + * Conversion is done by simple multiplication with HZ + * + * secs_to_jiffies() is defined as a macro rather than a static inline + * function so it can be used in static initializers. + * + * Return: jiffies value + */ +#define secs_to_jiffies(_secs) ((_secs) * HZ) + extern unsigned long __usecs_to_jiffies(const unsigned int u); #if !(USEC_PER_SEC % HZ) static inline unsigned long _usecs_to_jiffies(const unsigned int u) -- cgit v1.2.3 From d9d959c36bec59f11c0eb6b5308729e3c4901b5e Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Wed, 30 Oct 2024 12:27:34 +0100 Subject: PCI: Make pcim_request_all_regions() a public function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to remove the deprecated function pcim_iomap_regions_request_all(), a few drivers need an interface to request all BARs a PCI device offers. Make pcim_request_all_regions() a public interface. Link: https://lore.kernel.org/r/20241030112743.104395-2-pstanner@redhat.com Signed-off-by: Philipp Stanner Signed-off-by: Bjorn Helgaas Reviewed-by: Damien Le Moal Reviewed-by: Ilpo Järvinen --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 573b4c4c2be6..3b151c8331e5 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2293,6 +2293,7 @@ static inline void pci_fixup_device(enum pci_fixup_pass pass, struct pci_dev *dev) { } #endif +int pcim_request_all_regions(struct pci_dev *pdev, const char *name); void __iomem *pcim_iomap(struct pci_dev *pdev, int bar, unsigned long maxlen); void __iomem *pcim_iomap_region(struct pci_dev *pdev, int bar, const char *name); -- cgit v1.2.3 From 6d9c59212523e719a63575222f1cd1b0aca3da4f Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Wed, 30 Oct 2024 12:27:43 +0100 Subject: PCI: Remove pcim_iomap_regions_request_all() pcim_iomap_regions_request_all() have been deprecated in commit e354bb84a4c1 ("PCI: Deprecate pcim_iomap_table(), pcim_iomap_regions_request_all()"). All users of this function have been ported to other interfaces by now. Remove pcim_iomap_regions_request_all(). Link: https://lore.kernel.org/r/20241030112743.104395-11-pstanner@redhat.com Signed-off-by: Philipp Stanner Signed-off-by: Bjorn Helgaas Reviewed-by: Damien Le Moal --- include/linux/pci.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 3b151c8331e5..b59197635c5c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2301,8 +2301,6 @@ void pcim_iounmap(struct pci_dev *pdev, void __iomem *addr); void __iomem * const *pcim_iomap_table(struct pci_dev *pdev); int pcim_request_region(struct pci_dev *pdev, int bar, const char *name); int pcim_iomap_regions(struct pci_dev *pdev, int mask, const char *name); -int pcim_iomap_regions_request_all(struct pci_dev *pdev, int mask, - const char *name); void pcim_iounmap_regions(struct pci_dev *pdev, int mask); void __iomem *pcim_iomap_range(struct pci_dev *pdev, int bar, unsigned long offset, unsigned long len); -- cgit v1.2.3 From 4a6afd60733c75369680f4a40c82b7c8528f4a7a Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Wed, 16 Oct 2024 11:49:04 +0200 Subject: PCI: Make pcim_iounmap_region() a public function The function pcim_iounmap_regions() is problematic because it uses a bitmask mechanism to release / iounmap multiple BARs at once. It, thus, prevents getting rid of the problematic iomap table mechanism which was deprecated in commit e354bb84a4c1 ("PCI: Deprecate pcim_iomap_table(), pcim_iomap_regions_request_all()"). pcim_iounmap_region() does not have that problem. Make it public as the successor of pcim_iounmap_regions(). Link: https://lore.kernel.org/r/20241016094911.24818-3-pstanner@redhat.com Signed-off-by: Philipp Stanner Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index b59197635c5c..b8d248c136ba 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2297,6 +2297,7 @@ int pcim_request_all_regions(struct pci_dev *pdev, const char *name); void __iomem *pcim_iomap(struct pci_dev *pdev, int bar, unsigned long maxlen); void __iomem *pcim_iomap_region(struct pci_dev *pdev, int bar, const char *name); +void pcim_iounmap_region(struct pci_dev *pdev, int bar); void pcim_iounmap(struct pci_dev *pdev, void __iomem *addr); void __iomem * const *pcim_iomap_table(struct pci_dev *pdev); int pcim_request_region(struct pci_dev *pdev, int bar, const char *name); -- cgit v1.2.3 From 3e7f43188ee227bcf0f07f60a00f1fd1aca10e6a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 13:01:36 -0700 Subject: KVM: Protect vCPU's "last run PID" with rwlock, not RCU To avoid jitter on KVM_RUN due to synchronize_rcu(), use a rwlock instead of RCU to protect vcpu->pid, a.k.a. the pid of the task last used to a vCPU. When userspace is doing M:N scheduling of tasks to vCPUs, e.g. to run SEV migration helper vCPUs during post-copy, the synchronize_rcu() needed to change the PID associated with the vCPU can stall for hundreds of milliseconds, which is problematic for latency sensitive post-copy operations. In the directed yield path, do not acquire the lock if it's contended, i.e. if the associated PID is changing, as that means the vCPU's task is already running. Reported-by: Steve Rutherford Reviewed-by: Steve Rutherford Acked-by: Oliver Upton Link: https://lore.kernel.org/r/20240802200136.329973-3-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 02f0206fd2dc..18a1672ffcbf 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -334,7 +334,8 @@ struct kvm_vcpu { #ifndef __KVM_HAVE_ARCH_WQP struct rcuwait wait; #endif - struct pid __rcu *pid; + struct pid *pid; + rwlock_t pid_lock; int sigset_active; sigset_t sigset; unsigned int halt_poll_ns; -- cgit v1.2.3 From 9e9af8bbb5f9b565b9faf691f96f661791e199b0 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 10 Oct 2024 07:26:04 -0700 Subject: perf/x86/rapl: Clean up cpumask and hotplug The rapl pmu is die scope, which is supported by the generic perf_event subsystem now. Set the scope for the rapl PMU and remove all the cpumask and hotplug codes. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Oliver Sang Tested-by: Dhananjay Ugwekar Link: https://lore.kernel.org/r/20241010142604.770192-2-kan.liang@linux.intel.com --- include/linux/cpuhotplug.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 2361ed4d2b15..37a9afffb59e 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -208,7 +208,6 @@ enum cpuhp_state { CPUHP_AP_PERF_X86_UNCORE_ONLINE, CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE, CPUHP_AP_PERF_X86_AMD_POWER_ONLINE, - CPUHP_AP_PERF_X86_RAPL_ONLINE, CPUHP_AP_PERF_S390_CF_ONLINE, CPUHP_AP_PERF_S390_SF_ONLINE, CPUHP_AP_PERF_ARM_CCI_ONLINE, -- cgit v1.2.3 From dd1a7567784e2b1f80258be04f57bcfa82c997eb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 23 Oct 2024 21:41:59 -0700 Subject: uprobes: SRCU-protect uretprobe lifetime (with timeout) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid taking refcount on uprobe in prepare_uretprobe(), instead take uretprobe-specific SRCU lock and keep it active as kernel transfers control back to user space. Given we can't rely on user space returning from traced function within reasonable time period, we need to make sure not to keep SRCU lock active for too long, though. To that effect, we employ a timer callback which is meant to terminate SRCU lock region after predefined timeout (currently set to 100ms), and instead transfer underlying struct uprobe's lifetime protection to refcounting. This fallback to less scalable refcounting after 100ms is a fine tradeoff from uretprobe's scalability and performance perspective, because uretprobing *long running* user functions inherently doesn't run into scalability issues (there is just not enough frequency to cause noticeable issues with either performance or scalability). The overall trick is in ensuring synchronization between current thread and timer's callback fired on some other thread. To cope with that with minimal logic complications, we add hprobe wrapper which is used to contain all the synchronization related issues behind a small number of basic helpers: hprobe_expire() for "downgrading" uprobe from SRCU-protected state to refcounted state, and a hprobe_consume() and hprobe_finalize() pair of single-use consuming helpers. Other than that, whatever current thread's logic is there stays the same, as timer thread cannot modify return_instance state (or add new/remove old return_instances). It only takes care of SRCU unlock and uprobe refcounting, which is hidden from the higher-level uretprobe handling logic. We use atomic xchg() in hprobe_consume(), which is called from performance critical handle_uretprobe_chain() function run in the current context. When uncontended, this xchg() doesn't seem to hurt performance as there are no other competing CPUs fighting for the same cache line. We also mark struct return_instance as ____cacheline_aligned to ensure no false sharing can happen. Another technical moment. We need to make sure that the list of return instances can be safely traversed under RCU from timer callback, so we delay return_instance freeing with kfree_rcu() and make sure that list modifications use RCU-aware operations. Also, given SRCU lock survives transition from kernel to user space and back we need to use lower-level __srcu_read_lock() and __srcu_read_unlock() to avoid lockdep complaining. Just to give an impression of a kind of performance improvements this change brings, below are benchmarking results with and without these SRCU changes, assuming other uprobe optimizations (mainly RCU Tasks Trace for entry uprobes, lockless RB-tree lookup, and lockless VMA to uprobe lookup) are left intact: WITHOUT SRCU for uretprobes =========================== uretprobe-nop ( 1 cpus): 2.197 ± 0.002M/s ( 2.197M/s/cpu) uretprobe-nop ( 2 cpus): 3.325 ± 0.001M/s ( 1.662M/s/cpu) uretprobe-nop ( 3 cpus): 4.129 ± 0.002M/s ( 1.376M/s/cpu) uretprobe-nop ( 4 cpus): 6.180 ± 0.003M/s ( 1.545M/s/cpu) uretprobe-nop ( 8 cpus): 7.323 ± 0.005M/s ( 0.915M/s/cpu) uretprobe-nop (16 cpus): 6.943 ± 0.005M/s ( 0.434M/s/cpu) uretprobe-nop (32 cpus): 5.931 ± 0.014M/s ( 0.185M/s/cpu) uretprobe-nop (64 cpus): 5.145 ± 0.003M/s ( 0.080M/s/cpu) uretprobe-nop (80 cpus): 4.925 ± 0.005M/s ( 0.062M/s/cpu) WITH SRCU for uretprobes ======================== uretprobe-nop ( 1 cpus): 1.968 ± 0.001M/s ( 1.968M/s/cpu) uretprobe-nop ( 2 cpus): 3.739 ± 0.003M/s ( 1.869M/s/cpu) uretprobe-nop ( 3 cpus): 5.616 ± 0.003M/s ( 1.872M/s/cpu) uretprobe-nop ( 4 cpus): 7.286 ± 0.002M/s ( 1.822M/s/cpu) uretprobe-nop ( 8 cpus): 13.657 ± 0.007M/s ( 1.707M/s/cpu) uretprobe-nop (32 cpus): 45.305 ± 0.066M/s ( 1.416M/s/cpu) uretprobe-nop (64 cpus): 42.390 ± 0.922M/s ( 0.662M/s/cpu) uretprobe-nop (80 cpus): 47.554 ± 2.411M/s ( 0.594M/s/cpu) Signed-off-by: Andrii Nakryiko Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20241024044159.3156646-3-andrii@kernel.org --- include/linux/uprobes.h | 54 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index dbaf04189548..7a051b5d2edd 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -15,6 +15,7 @@ #include #include #include +#include struct uprobe; struct vm_area_struct; @@ -67,6 +68,53 @@ enum uprobe_task_state { UTASK_SSTEP_TRAPPED, }; +/* The state of hybrid-lifetime uprobe inside struct return_instance */ +enum hprobe_state { + HPROBE_LEASED, /* uretprobes_srcu-protected uprobe */ + HPROBE_STABLE, /* refcounted uprobe */ + HPROBE_GONE, /* NULL uprobe, SRCU expired, refcount failed */ + HPROBE_CONSUMED, /* uprobe "consumed" by uretprobe handler */ +}; + +/* + * Hybrid lifetime uprobe. Represents a uprobe instance that could be either + * SRCU protected (with SRCU protection eventually potentially timing out), + * refcounted using uprobe->ref, or there could be no valid uprobe (NULL). + * + * hprobe's internal state is setup such that background timer thread can + * atomically "downgrade" temporarily RCU-protected uprobe into refcounted one + * (or no uprobe, if refcounting failed). + * + * *stable* pointer always point to the uprobe (or could be NULL if there is + * was no valid underlying uprobe to begin with). + * + * *leased* pointer is the key to achieving race-free atomic lifetime state + * transition and can have three possible states: + * - either the same non-NULL value as *stable*, in which case uprobe is + * SRCU-protected; + * - NULL, in which case uprobe (if there is any) is refcounted; + * - special __UPROBE_DEAD value, which represents an uprobe that was SRCU + * protected initially, but SRCU period timed out and we attempted to + * convert it to refcounted, but refcount_inc_not_zero() failed, because + * uprobe effectively went away (the last consumer unsubscribed). In this + * case it's important to know that *stable* pointer (which still has + * non-NULL uprobe pointer) shouldn't be used, because lifetime of + * underlying uprobe is not guaranteed anymore. __UPROBE_DEAD is just an + * internal marker and is handled transparently by hprobe_fetch() helper. + * + * When uprobe is SRCU-protected, we also record srcu_idx value, necessary for + * SRCU unlocking. + * + * See hprobe_expire() and hprobe_fetch() for details of race-free uprobe + * state transitioning details. It all hinges on atomic xchg() over *leaded* + * pointer. *stable* pointer, once initially set, is not modified concurrently. + */ +struct hprobe { + enum hprobe_state state; + int srcu_idx; + struct uprobe *uprobe; +}; + /* * uprobe_task: Metadata of a task while it singlesteps. */ @@ -86,6 +134,7 @@ struct uprobe_task { }; struct uprobe *active_uprobe; + struct timer_list ri_timer; unsigned long xol_vaddr; struct arch_uprobe *auprobe; @@ -100,7 +149,7 @@ struct return_consumer { }; struct return_instance { - struct uprobe *uprobe; + struct hprobe hprobe; unsigned long func; unsigned long stack; /* stack pointer */ unsigned long orig_ret_vaddr; /* original return address */ @@ -108,9 +157,10 @@ struct return_instance { int consumers_cnt; struct return_instance *next; /* keep as stack */ + struct rcu_head rcu; struct return_consumer consumers[] __counted_by(consumers_cnt); -}; +} ____cacheline_aligned; enum rp_check { RP_CHECK_CALL, -- cgit v1.2.3 From f247fd22e9f27d919d81861e4dcc438e0b6d179b Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Wed, 23 Oct 2024 08:56:00 +0200 Subject: s390/time: Add clocksource id to TOD clock To allow specifying the clock source in the upcoming PtP driver, add a clocksource ID to the s390 TOD clock. Acked-by: Heiko Carstens Acked-by: Richard Cochran Signed-off-by: Sven Schnelle Link: https://patch.msgid.link/20241023065601.449586-2-svens@linux.ibm.com Signed-off-by: Jakub Kicinski --- include/linux/clocksource_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/clocksource_ids.h b/include/linux/clocksource_ids.h index 2bb4d8c2f1b0..c4ef4ae2eded 100644 --- a/include/linux/clocksource_ids.h +++ b/include/linux/clocksource_ids.h @@ -6,6 +6,7 @@ enum clocksource_ids { CSID_GENERIC = 0, CSID_ARM_ARCH_COUNTER, + CSID_S390_TOD, CSID_X86_TSC_EARLY, CSID_X86_TSC, CSID_X86_KVM_CLK, -- cgit v1.2.3 From 3b1596a21fbf210f5b763fd3c0be280650475b52 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Oct 2024 13:54:43 +0100 Subject: clockevents: Shutdown and unregister current clockevents at CPUHP_AP_TICK_DYING The way the clockevent devices are finally stopped while a CPU is offlining is currently chaotic. The layout being by order: 1) tick_sched_timer_dying() stops the tick and the underlying clockevent but only for oneshot case. The periodic tick and its related clockevent still runs. 2) tick_broadcast_offline() detaches and stops the per-cpu oneshot broadcast and append it to the released list. 3) Some individual clockevent drivers stop the clockevents (a second time if the tick is oneshot) 4) Once the CPU is dead, a control CPU remotely detaches and stops (a 3rd time if oneshot mode) the CPU clockevent and adds it to the released list. 5) The released list containing the broadcast device released on step 2) and the remotely detached clockevent from step 4) are unregistered. These random events can be factorized if the current clockevent is detached and stopped by the dying CPU at the generic layer, that is from the dying CPU: a) Stop the tick b) Stop/detach the underlying per-cpu oneshot broadcast clockevent c) Stop/detach the underlying clockevent d) Release / unregister the clockevents from b) and c) e) Release / unregister the remaining clockevents from the dying CPU. This part could be performed by the dying CPU This way the drivers and the tick layer don't need to care about clockevent operations during cpuhotplug down. This also unifies the tick behaviour on offline CPUs between oneshot and periodic modes, avoiding offline ticks altogether for sanity. Adopt the simplification. [ tglx: Remove the WARN_ON() in clockevents_register_device() as that is called from an upcoming CPU before the CPU is marked online ] Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241029125451.54574-3-frederic@kernel.org --- include/linux/tick.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tick.h b/include/linux/tick.h index 72744638c5b0..b0c74bfe0600 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -20,12 +20,10 @@ extern void __init tick_init(void); extern void tick_suspend_local(void); /* Should be core only, but XEN resume magic and ARM BL switcher require it */ extern void tick_resume_local(void); -extern void tick_cleanup_dead_cpu(int cpu); #else /* CONFIG_GENERIC_CLOCKEVENTS */ static inline void tick_init(void) { } static inline void tick_suspend_local(void) { } static inline void tick_resume_local(void) { } -static inline void tick_cleanup_dead_cpu(int cpu) { } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ #if defined(CONFIG_GENERIC_CLOCKEVENTS) && defined(CONFIG_HOTPLUG_CPU) -- cgit v1.2.3 From 697a4001d31a607a72c6297e4eb0f7918c6e6929 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Mon, 7 Oct 2024 01:14:05 +0100 Subject: mfd: axp20x: Ensure relationship between IDs and model names At the moment there is an implicit relationship between the AXP model IDs and the order of the strings in the axp20x_model_names[] array. This is fragile, and makes adding IDs in the middle error prone. Make this relationship official by changing the ID type to the actual enum used, and using indexed initialisers for the string list. Signed-off-by: Andre Przywara Reviewed-by: Chen-Yu Tsai Link: https://lore.kernel.org/r/20241007001408.27249-3-andre.przywara@arm.com Signed-off-by: Lee Jones --- include/linux/mfd/axp20x.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h index f4dfc1871a95..79ecaaaa2070 100644 --- a/include/linux/mfd/axp20x.h +++ b/include/linux/mfd/axp20x.h @@ -959,7 +959,7 @@ struct axp20x_dev { unsigned long irq_flags; struct regmap *regmap; struct regmap_irq_chip_data *regmap_irqc; - long variant; + enum axp20x_variants variant; int nr_cells; const struct mfd_cell *cells; const struct regmap_config *regmap_cfg; -- cgit v1.2.3 From 35fec94afe045856456faca4879b9c560e39d1e3 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Mon, 7 Oct 2024 01:14:07 +0100 Subject: mfd: axp20x: Add support for AXP323 The X-Powers AXP323 is a very close sibling of the AXP313A. The only difference seems to be the ability to dual-phase the first two DC/DC converter, which adds another register. Add the required boilerplate to introduce a new PMIC to the AXP MFD driver. Where possible, this just maps into the existing structs defined for the AXP313A, only deviating where needed. Signed-off-by: Andre Przywara Reviewed-by: Chen-Yu Tsai Link: https://lore.kernel.org/r/20241007001408.27249-5-andre.przywara@arm.com Signed-off-by: Lee Jones --- include/linux/mfd/axp20x.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h index 79ecaaaa2070..c3df0e615fbf 100644 --- a/include/linux/mfd/axp20x.h +++ b/include/linux/mfd/axp20x.h @@ -19,6 +19,7 @@ enum axp20x_variants { AXP223_ID, AXP288_ID, AXP313A_ID, + AXP323_ID, AXP717_ID, AXP803_ID, AXP806_ID, @@ -113,6 +114,7 @@ enum axp20x_variants { #define AXP313A_SHUTDOWN_CTRL 0x1a #define AXP313A_IRQ_EN 0x20 #define AXP313A_IRQ_STATE 0x21 +#define AXP323_DCDC_MODE_CTRL2 0x22 #define AXP717_ON_INDICATE 0x00 #define AXP717_PMU_STATUS_2 0x01 -- cgit v1.2.3 From 6e31bb8d3a63bb2c3efab2fb6bcfccac677a4581 Mon Sep 17 00:00:00 2001 From: Yassine Oudjana Date: Fri, 18 Oct 2024 11:10:46 +0300 Subject: mfd: mt6397: Add initial support for MT6328 The MT6328 PMIC is commonly used with the MT6735 SoC. Add initial support for this PMIC. Signed-off-by: Yassine Oudjana Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20241018081050.23592-5-y.oudjana@protonmail.com Signed-off-by: Lee Jones --- include/linux/mfd/mt6328/core.h | 53 +++ include/linux/mfd/mt6328/registers.h | 822 +++++++++++++++++++++++++++++++++++ include/linux/mfd/mt6397/core.h | 11 +- 3 files changed, 881 insertions(+), 5 deletions(-) create mode 100644 include/linux/mfd/mt6328/core.h create mode 100644 include/linux/mfd/mt6328/registers.h (limited to 'include/linux') diff --git a/include/linux/mfd/mt6328/core.h b/include/linux/mfd/mt6328/core.h new file mode 100644 index 000000000000..9a08aed72b9f --- /dev/null +++ b/include/linux/mfd/mt6328/core.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2015 MediaTek Inc. + * Copyright (c) 2022 Yassine Oudjana + */ + +#ifndef __MFD_MT6328_CORE_H__ +#define __MFD_MT6328_CORE_H__ + +enum mt6328_irq_status_numbers { + MT6328_IRQ_STATUS_PWRKEY = 0, + MT6328_IRQ_STATUS_HOMEKEY, + MT6328_IRQ_STATUS_PWRKEY_R, + MT6328_IRQ_STATUS_HOMEKEY_R, + MT6328_IRQ_STATUS_THR_H, + MT6328_IRQ_STATUS_THR_L, + MT6328_IRQ_STATUS_BAT_H, + MT6328_IRQ_STATUS_BAT_L, + MT6328_IRQ_STATUS_RTC, + MT6328_IRQ_STATUS_AUDIO, + MT6328_IRQ_STATUS_ACCDET, + MT6328_IRQ_STATUS_ACCDET_EINT, + MT6328_IRQ_STATUS_ACCDET_NEGV, + MT6328_IRQ_STATUS_NI_LBAT_INT, + MT6328_IRQ_STATUS_VPROC_OC = 16, + MT6328_IRQ_STATUS_VSYS_OC, + MT6328_IRQ_STATUS_VLTE_OC, + MT6328_IRQ_STATUS_VCORE_OC, + MT6328_IRQ_STATUS_VPA_OC, + MT6328_IRQ_STATUS_LDO_OC, + MT6328_IRQ_STATUS_BAT2_H, + MT6328_IRQ_STATUS_BAT2_L, + MT6328_IRQ_STATUS_VISMPS0_H, + MT6328_IRQ_STATUS_VISMPS0_L, + MT6328_IRQ_STATUS_AUXADC_IMP, + MT6328_IRQ_STATUS_OV = 32, + MT6328_IRQ_STATUS_BVALID_DET, + MT6328_IRQ_STATUS_VBATON_HV, + MT6328_IRQ_STATUS_VBATON_UNDET, + MT6328_IRQ_STATUS_WATCHDOG, + MT6328_IRQ_STATUS_PCHR_CM_VDEC, + MT6328_IRQ_STATUS_CHRDET, + MT6328_IRQ_STATUS_PCHR_CM_VINC, + MT6328_IRQ_STATUS_FG_BAT_H, + MT6328_IRQ_STATUS_FG_BAT_L, + MT6328_IRQ_STATUS_FG_CUR_H, + MT6328_IRQ_STATUS_FG_CUR_L, + MT6328_IRQ_STATUS_FG_ZCV, + MT6328_IRQ_STATUS_SPKL_D, + MT6328_IRQ_STATUS_SPKL_AB, +}; + +#endif /* __MFD_MT6323_CORE_H__ */ diff --git a/include/linux/mfd/mt6328/registers.h b/include/linux/mfd/mt6328/registers.h new file mode 100644 index 000000000000..8199aaea27b9 --- /dev/null +++ b/include/linux/mfd/mt6328/registers.h @@ -0,0 +1,822 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2022 Yassine Oudjana + */ + +#ifndef __MFD_MT6328_REGISTERS_H__ +#define __MFD_MT6328_REGISTERS_H__ + +/* PMIC Registers */ +#define MT6328_STRUP_CON0 0x0000 +#define MT6328_STRUP_CON2 0x0002 +#define MT6328_STRUP_CON3 0x0004 +#define MT6328_STRUP_CON4 0x0006 +#define MT6328_STRUP_CON5 0x0008 +#define MT6328_STRUP_CON6 0x000a +#define MT6328_STRUP_CON7 0x000c +#define MT6328_STRUP_CON8 0x000e +#define MT6328_STRUP_CON9 0x0010 +#define MT6328_STRUP_CON10 0x0012 +#define MT6328_STRUP_CON11 0x0014 +#define MT6328_STRUP_CON12 0x0016 +#define MT6328_STRUP_CON13 0x0018 +#define MT6328_STRUP_CON14 0x001a +#define MT6328_STRUP_CON15 0x001c +#define MT6328_STRUP_CON16 0x001e +#define MT6328_STRUP_CON17 0x0020 +#define MT6328_STRUP_CON18 0x0022 +#define MT6328_STRUP_CON19 0x0024 +#define MT6328_STRUP_CON20 0x0026 +#define MT6328_STRUP_CON21 0x0028 +#define MT6328_STRUP_CON22 0x002a +#define MT6328_STRUP_CON23 0x002c +#define MT6328_STRUP_CON24 0x002e +#define MT6328_STRUP_CON25 0x0030 +#define MT6328_STRUP_CON26 0x0032 +#define MT6328_STRUP_CON27 0x0034 +#define MT6328_STRUP_CON28 0x0036 +#define MT6328_STRUP_CON29 0x0038 +#define MT6328_STRUP_CON30 0x003a +#define MT6328_STRUP_CON31 0x003c +#define MT6328_STRUP_CON32 0x003e +#define MT6328_STRUP_ANA_CON0 0x0040 +#define MT6328_HWCID 0x0200 +#define MT6328_SWCID 0x0202 +#define MT6328_TOP_CON 0x0204 +#define MT6328_TEST_OUT 0x0206 +#define MT6328_TEST_CON0 0x0208 +#define MT6328_TEST_CON1 0x020a +#define MT6328_TESTMODE_SW 0x020c +#define MT6328_EN_STATUS0 0x020e +#define MT6328_EN_STATUS1 0x0210 +#define MT6328_EN_STATUS2 0x0212 +#define MT6328_OCSTATUS0 0x0214 +#define MT6328_OCSTATUS1 0x0216 +#define MT6328_OCSTATUS2 0x0218 +#define MT6328_PGDEBSTATUS 0x021a +#define MT6328_PGSTATUS 0x021c +#define MT6328_THERMALSTATUS 0x021e +#define MT6328_TOPSTATUS 0x0220 +#define MT6328_TDSEL_CON 0x0222 +#define MT6328_RDSEL_CON 0x0224 +#define MT6328_SMT_CON0 0x0226 +#define MT6328_SMT_CON1 0x0228 +#define MT6328_SMT_CON2 0x022a +#define MT6328_DRV_CON0 0x022c +#define MT6328_DRV_CON1 0x022e +#define MT6328_DRV_CON2 0x0230 +#define MT6328_DRV_CON3 0x0232 +#define MT6328_TOP_STATUS 0x0234 +#define MT6328_TOP_STATUS_SET 0x0236 +#define MT6328_TOP_STATUS_CLR 0x0238 +#define MT6328_RGS_ANA_MON 0x023a +#define MT6328_TOP_CKPDN_CON0 0x023c +#define MT6328_TOP_CKPDN_CON0_SET 0x023e +#define MT6328_TOP_CKPDN_CON0_CLR 0x0240 +#define MT6328_TOP_CKPDN_CON1 0x0242 +#define MT6328_TOP_CKPDN_CON1_SET 0x0244 +#define MT6328_TOP_CKPDN_CON1_CLR 0x0246 +#define MT6328_TOP_CKPDN_CON2 0x0248 +#define MT6328_TOP_CKPDN_CON2_SET 0x024a +#define MT6328_TOP_CKPDN_CON2_CLR 0x024c +#define MT6328_TOP_CKPDN_CON3 0x024e +#define MT6328_TOP_CKPDN_CON3_SET 0x0250 +#define MT6328_TOP_CKPDN_CON3_CLR 0x0252 +#define MT6328_TOP_CKPDN_CON4 0x0254 +#define MT6328_TOP_CKPDN_CON4_SET 0x0256 +#define MT6328_TOP_CKPDN_CON4_CLR 0x0258 +#define MT6328_TOP_CKSEL_CON0 0x025a +#define MT6328_TOP_CKSEL_CON0_SET 0x025c +#define MT6328_TOP_CKSEL_CON0_CLR 0x025e +#define MT6328_TOP_CKSEL_CON1 0x0260 +#define MT6328_TOP_CKSEL_CON1_SET 0x0262 +#define MT6328_TOP_CKSEL_CON1_CLR 0x0264 +#define MT6328_TOP_CKSEL_CON2 0x0266 +#define MT6328_TOP_CKSEL_CON2_SET 0x0268 +#define MT6328_TOP_CKSEL_CON2_CLR 0x026a +#define MT6328_TOP_CKDIVSEL_CON0 0x026c +#define MT6328_TOP_CKDIVSEL_CON0_SET 0x026e +#define MT6328_TOP_CKDIVSEL_CON0_CLR 0x0270 +#define MT6328_TOP_CKDIVSEL_CON1 0x0272 +#define MT6328_TOP_CKDIVSEL_CON1_SET 0x0274 +#define MT6328_TOP_CKDIVSEL_CON1_CLR 0x0276 +#define MT6328_TOP_CKHWEN_CON0 0x0278 +#define MT6328_TOP_CKHWEN_CON0_SET 0x027a +#define MT6328_TOP_CKHWEN_CON0_CLR 0x027c +#define MT6328_TOP_CKHWEN_CON1 0x027e +#define MT6328_TOP_CKHWEN_CON1_SET 0x0280 +#define MT6328_TOP_CKHWEN_CON1_CLR 0x0282 +#define MT6328_TOP_CKTST_CON0 0x0284 +#define MT6328_TOP_CKTST_CON1 0x0286 +#define MT6328_TOP_CKTST_CON2 0x0288 +#define MT6328_TOP_CLKSQ 0x028a +#define MT6328_TOP_CLKSQ_SET 0x028c +#define MT6328_TOP_CLKSQ_CLR 0x028e +#define MT6328_TOP_CLKSQ_RTC 0x0290 +#define MT6328_TOP_CLKSQ_RTC_SET 0x0292 +#define MT6328_TOP_CLKSQ_RTC_CLR 0x0294 +#define MT6328_TOP_CLK_TRIM 0x0296 +#define MT6328_TOP_RST_CON0 0x0298 +#define MT6328_TOP_RST_CON0_SET 0x029a +#define MT6328_TOP_RST_CON0_CLR 0x029c +#define MT6328_TOP_RST_CON1 0x029e +#define MT6328_TOP_RST_MISC 0x02a0 +#define MT6328_TOP_RST_MISC_SET 0x02a2 +#define MT6328_TOP_RST_MISC_CLR 0x02a4 +#define MT6328_TOP_RST_STATUS 0x02a6 +#define MT6328_TOP_RST_STATUS_SET 0x02a8 +#define MT6328_TOP_RST_STATUS_CLR 0x02aa +#define MT6328_INT_CON0 0x02ac +#define MT6328_INT_CON0_SET 0x02ae +#define MT6328_INT_CON0_CLR 0x02b0 +#define MT6328_INT_CON1 0x02b2 +#define MT6328_INT_CON1_SET 0x02b4 +#define MT6328_INT_CON1_CLR 0x02b6 +#define MT6328_INT_CON2 0x02b8 +#define MT6328_INT_CON2_SET 0x02ba +#define MT6328_INT_CON2_CLR 0x02bc +#define MT6328_INT_MISC_CON 0x02be +#define MT6328_INT_MISC_CON_SET 0x02c0 +#define MT6328_INT_MISC_CON_CLR 0x02c2 +#define MT6328_INT_STATUS0 0x02c4 +#define MT6328_INT_STATUS1 0x02c6 +#define MT6328_INT_STATUS2 0x02c8 +#define MT6328_OC_GEAR_0 0x02ca +#define MT6328_FQMTR_CON0 0x02cc +#define MT6328_FQMTR_CON1 0x02ce +#define MT6328_FQMTR_CON2 0x02d0 +#define MT6328_RG_SPI_CON 0x02d2 +#define MT6328_DEW_DIO_EN 0x02d4 +#define MT6328_DEW_READ_TEST 0x02d6 +#define MT6328_DEW_WRITE_TEST 0x02d8 +#define MT6328_DEW_CRC_SWRST 0x02da +#define MT6328_DEW_CRC_EN 0x02dc +#define MT6328_DEW_CRC_VAL 0x02de +#define MT6328_DEW_DBG_MON_SEL 0x02e0 +#define MT6328_DEW_CIPHER_KEY_SEL 0x02e2 +#define MT6328_DEW_CIPHER_IV_SEL 0x02e4 +#define MT6328_DEW_CIPHER_EN 0x02e6 +#define MT6328_DEW_CIPHER_RDY 0x02e8 +#define MT6328_DEW_CIPHER_MODE 0x02ea +#define MT6328_DEW_CIPHER_SWRST 0x02ec +#define MT6328_DEW_RDDMY_NO 0x02ee +#define MT6328_INT_TYPE_CON0 0x02f0 +#define MT6328_INT_TYPE_CON0_SET 0x02f2 +#define MT6328_INT_TYPE_CON0_CLR 0x02f4 +#define MT6328_INT_TYPE_CON1 0x02f6 +#define MT6328_INT_TYPE_CON1_SET 0x02f8 +#define MT6328_INT_TYPE_CON1_CLR 0x02fa +#define MT6328_INT_TYPE_CON2 0x02fc +#define MT6328_INT_TYPE_CON2_SET 0x02fe +#define MT6328_INT_TYPE_CON2_CLR 0x0300 +#define MT6328_INT_STA 0x0302 +#define MT6328_BUCK_ALL_CON0 0x0400 +#define MT6328_BUCK_ALL_CON1 0x0402 +#define MT6328_BUCK_ALL_CON2 0x0404 +#define MT6328_BUCK_ALL_CON3 0x0406 +#define MT6328_BUCK_ALL_CON4 0x0408 +#define MT6328_BUCK_ALL_CON5 0x040a +#define MT6328_BUCK_ALL_CON6 0x040c +#define MT6328_BUCK_ALL_CON9 0x040e +#define MT6328_BUCK_ALL_CON12 0x0410 +#define MT6328_BUCK_ALL_CON13 0x0412 +#define MT6328_BUCK_ALL_CON14 0x0414 +#define MT6328_BUCK_ALL_CON16 0x0416 +#define MT6328_BUCK_ALL_CON18 0x0418 +#define MT6328_BUCK_ALL_CON19 0x041a +#define MT6328_BUCK_ALL_CON20 0x041c +#define MT6328_BUCK_ALL_CON21 0x041e +#define MT6328_BUCK_ALL_CON22 0x0420 +#define MT6328_BUCK_ALL_CON23 0x0422 +#define MT6328_BUCK_ALL_CON24 0x0424 +#define MT6328_BUCK_ALL_CON25 0x0426 +#define MT6328_BUCK_ALL_CON26 0x0428 +#define MT6328_BUCK_ALL_CON27 0x042a +#define MT6328_BUCK_ALL_CON28 0x042c +#define MT6328_SMPS_TOP_ANA_CON0 0x042e +#define MT6328_SMPS_TOP_ANA_CON1 0x0430 +#define MT6328_SMPS_TOP_ANA_CON2 0x0432 +#define MT6328_SMPS_TOP_ANA_CON3 0x0434 +#define MT6328_SMPS_TOP_ANA_CON4 0x0436 +#define MT6328_SMPS_TOP_ANA_CON5 0x0438 +#define MT6328_SMPS_TOP_ANA_CON6 0x043a +#define MT6328_SMPS_TOP_ANA_CON7 0x043c +#define MT6328_SMPS_TOP_ANA_CON8 0x043e +#define MT6328_VCORE_ANA_CON0 0x0440 +#define MT6328_VCORE_ANA_CON1 0x0442 +#define MT6328_VCORE_ANA_CON2 0x0444 +#define MT6328_VCORE_ANA_CON3 0x0446 +#define MT6328_VCORE_ANA_CON4 0x0448 +#define MT6328_VSYS22_ANA_CON0 0x044a +#define MT6328_VSYS22_ANA_CON1 0x044c +#define MT6328_VSYS22_ANA_CON2 0x044e +#define MT6328_VSYS22_ANA_CON3 0x0450 +#define MT6328_VSYS22_ANA_CON4 0x0452 +#define MT6328_VPROC_ANA_CON0 0x0454 +#define MT6328_VPROC_ANA_CON1 0x0456 +#define MT6328_VPROC_ANA_CON2 0x0458 +#define MT6328_VPROC_ANA_CON3 0x045a +#define MT6328_VPROC_ANA_CON4 0x045c +#define MT6328_OSC32_ANA_CON0 0x045e +#define MT6328_OSC32_ANA_CON1 0x0460 +#define MT6328_VPA_ANA_CON0 0x0462 +#define MT6328_VPA_ANA_CON1 0x0464 +#define MT6328_VPA_ANA_CON2 0x0466 +#define MT6328_VPA_ANA_CON3 0x0468 +#define MT6328_VLTE_ANA_CON0 0x046a +#define MT6328_VLTE_ANA_CON1 0x046c +#define MT6328_VLTE_ANA_CON2 0x046e +#define MT6328_VLTE_ANA_CON3 0x0470 +#define MT6328_VLTE_ANA_CON4 0x0472 +#define MT6328_VPROC_CON0 0x0474 +#define MT6328_VPROC_CON1 0x0476 +#define MT6328_VPROC_CON2 0x0478 +#define MT6328_VPROC_CON3 0x047a +#define MT6328_VPROC_CON4 0x047c +#define MT6328_VPROC_CON5 0x047e +#define MT6328_VPROC_CON6 0x0480 +#define MT6328_VPROC_CON7 0x0482 +#define MT6328_VPROC_CON8 0x0484 +#define MT6328_VPROC_CON9 0x0486 +#define MT6328_VPROC_CON10 0x0488 +#define MT6328_VPROC_CON11 0x048a +#define MT6328_VPROC_CON12 0x048c +#define MT6328_VPROC_CON13 0x048e +#define MT6328_VPROC_CON14 0x0490 +#define MT6328_VPROC_CON15 0x0492 +#define MT6328_VPROC_CON16 0x0494 +#define MT6328_VPROC_CON17 0x0496 +#define MT6328_VPROC_CON18 0x0498 +#define MT6328_VPROC_CON19 0x049a +#define MT6328_VSRAM_CON0 0x049c +#define MT6328_VSRAM_CON1 0x049e +#define MT6328_VSRAM_CON2 0x04a0 +#define MT6328_VSRAM_CON3 0x04a2 +#define MT6328_VSRAM_CON4 0x04a4 +#define MT6328_VSRAM_CON5 0x04a6 +#define MT6328_VSRAM_CON6 0x04a8 +#define MT6328_VSRAM_CON7 0x04aa +#define MT6328_VSRAM_CON8 0x04ac +#define MT6328_VSRAM_CON9 0x04ae +#define MT6328_VSRAM_CON10 0x04b0 +#define MT6328_VSRAM_CON11 0x04b2 +#define MT6328_VSRAM_CON12 0x04b4 +#define MT6328_VSRAM_CON13 0x04b6 +#define MT6328_VSRAM_CON14 0x04b8 +#define MT6328_VSRAM_CON15 0x04ba +#define MT6328_VSRAM_CON16 0x04bc +#define MT6328_VSRAM_CON17 0x04be +#define MT6328_VSRAM_CON18 0x04c0 +#define MT6328_VSRAM_CON19 0x04c2 +#define MT6328_VLTE_CON0 0x04c4 +#define MT6328_VLTE_CON1 0x04c6 +#define MT6328_VLTE_CON2 0x04c8 +#define MT6328_VLTE_CON3 0x04ca +#define MT6328_VLTE_CON4 0x04cc +#define MT6328_VLTE_CON5 0x04ce +#define MT6328_VLTE_CON6 0x04d0 +#define MT6328_VLTE_CON7 0x04d2 +#define MT6328_VLTE_CON8 0x04d4 +#define MT6328_VLTE_CON9 0x04d6 +#define MT6328_VLTE_CON10 0x04d8 +#define MT6328_VLTE_CON11 0x04da +#define MT6328_VLTE_CON12 0x04dc +#define MT6328_VLTE_CON13 0x04de +#define MT6328_VLTE_CON14 0x04e0 +#define MT6328_VLTE_CON15 0x04e2 +#define MT6328_VLTE_CON16 0x04e4 +#define MT6328_VLTE_CON17 0x04e6 +#define MT6328_VLTE_CON18 0x04e8 +#define MT6328_VLTE_CON19 0x04ea +#define MT6328_VCORE1_CON0 0x0600 +#define MT6328_VCORE1_CON1 0x0602 +#define MT6328_VCORE1_CON2 0x0604 +#define MT6328_VCORE1_CON3 0x0606 +#define MT6328_VCORE1_CON4 0x0608 +#define MT6328_VCORE1_CON5 0x060a +#define MT6328_VCORE1_CON6 0x060c +#define MT6328_VCORE1_CON7 0x060e +#define MT6328_VCORE1_CON8 0x0610 +#define MT6328_VCORE1_CON9 0x0612 +#define MT6328_VCORE1_CON10 0x0614 +#define MT6328_VCORE1_CON11 0x0616 +#define MT6328_VCORE1_CON12 0x0618 +#define MT6328_VCORE1_CON13 0x061a +#define MT6328_VCORE1_CON14 0x061c +#define MT6328_VCORE1_CON15 0x061e +#define MT6328_VCORE1_CON16 0x0620 +#define MT6328_VCORE1_CON17 0x0622 +#define MT6328_VCORE1_CON18 0x0624 +#define MT6328_VCORE1_CON19 0x0626 +#define MT6328_VSYS22_CON0 0x0628 +#define MT6328_VSYS22_CON1 0x062a +#define MT6328_VSYS22_CON2 0x062c +#define MT6328_VSYS22_CON3 0x062e +#define MT6328_VSYS22_CON4 0x0630 +#define MT6328_VSYS22_CON5 0x0632 +#define MT6328_VSYS22_CON6 0x0634 +#define MT6328_VSYS22_CON7 0x0636 +#define MT6328_VSYS22_CON8 0x0638 +#define MT6328_VSYS22_CON9 0x063a +#define MT6328_VSYS22_CON10 0x063c +#define MT6328_VSYS22_CON11 0x063e +#define MT6328_VSYS22_CON12 0x0640 +#define MT6328_VSYS22_CON13 0x0642 +#define MT6328_VSYS22_CON14 0x0644 +#define MT6328_VSYS22_CON15 0x0646 +#define MT6328_VSYS22_CON16 0x0648 +#define MT6328_VSYS22_CON17 0x064a +#define MT6328_VSYS22_CON18 0x064c +#define MT6328_VSYS22_CON19 0x064e +#define MT6328_VPA_CON0 0x0650 +#define MT6328_VPA_CON1 0x0652 +#define MT6328_VPA_CON2 0x0654 +#define MT6328_VPA_CON3 0x0656 +#define MT6328_VPA_CON4 0x0658 +#define MT6328_VPA_CON5 0x065a +#define MT6328_VPA_CON6 0x065c +#define MT6328_VPA_CON7 0x065e +#define MT6328_VPA_CON8 0x0660 +#define MT6328_VPA_CON9 0x0662 +#define MT6328_VPA_CON10 0x0664 +#define MT6328_VPA_CON11 0x0666 +#define MT6328_VPA_CON12 0x0668 +#define MT6328_VPA_CON13 0x066a +#define MT6328_VPA_CON14 0x066c +#define MT6328_VPA_CON15 0x066e +#define MT6328_VPA_CON16 0x0670 +#define MT6328_VPA_CON17 0x0672 +#define MT6328_VPA_CON18 0x0674 +#define MT6328_VPA_CON19 0x0676 +#define MT6328_VPA_CON20 0x0678 +#define MT6328_VPA_CON21 0x067a +#define MT6328_VPA_CON22 0x067c +#define MT6328_VPA_CON23 0x067e +#define MT6328_VPA_CON24 0x0680 +#define MT6328_BUCK_K_CON0 0x0682 +#define MT6328_BUCK_K_CON1 0x0684 +#define MT6328_BUCK_K_CON2 0x0686 +#define MT6328_BUCK_K_CON3 0x0688 +#define MT6328_ZCD_CON0 0x0800 +#define MT6328_ZCD_CON1 0x0802 +#define MT6328_ZCD_CON2 0x0804 +#define MT6328_ZCD_CON3 0x0806 +#define MT6328_ZCD_CON4 0x0808 +#define MT6328_ZCD_CON5 0x080a +#define MT6328_ISINK0_CON0 0x080c +#define MT6328_ISINK0_CON1 0x080e +#define MT6328_ISINK0_CON2 0x0810 +#define MT6328_ISINK0_CON3 0x0812 +#define MT6328_ISINK1_CON0 0x0814 +#define MT6328_ISINK1_CON1 0x0816 +#define MT6328_ISINK1_CON2 0x0818 +#define MT6328_ISINK1_CON3 0x081a +#define MT6328_ISINK2_CON1 0x081c +#define MT6328_ISINK3_CON1 0x081e +#define MT6328_ISINK_ANA0 0x0820 +#define MT6328_ISINK_ANA1 0x0822 +#define MT6328_ISINK_PHASE_DLY 0x0824 +#define MT6328_ISINK_SFSTR 0x0826 +#define MT6328_ISINK_EN_CTRL 0x0828 +#define MT6328_ISINK_MODE_CTRL 0x082a +#define MT6328_VTCXO_0_CON0 0x0a00 +#define MT6328_VTCXO_1_CON0 0x0a02 +#define MT6328_VAUD28_CON0 0x0a04 +#define MT6328_VAUX18_CON0 0x0a06 +#define MT6328_VRF18_0_CON0 0x0a08 +#define MT6328_VRF18_0_CON1 0x0a0a +#define MT6328_VCAMA_CON0 0x0a0c +#define MT6328_VCN28_CON0 0x0a0e +#define MT6328_VCN33_CON0 0x0a10 +#define MT6328_VCN33_CON1 0x0a12 +#define MT6328_VCN33_CON2 0x0a14 +#define MT6328_VRF18_1_CON0 0x0a16 +#define MT6328_VRF18_1_CON1 0x0a18 +#define MT6328_VUSB33_CON0 0x0a1a +#define MT6328_VMCH_CON0 0x0a1c +#define MT6328_VMCH_CON1 0x0a1e +#define MT6328_VMC_CON0 0x0a20 +#define MT6328_VMC_CON1 0x0a22 +#define MT6328_VEMC_3V3_CON0 0x0a24 +#define MT6328_VEMC_3V3_CON1 0x0a26 +#define MT6328_VIO28_CON0 0x0a28 +#define MT6328_VCAMAF_CON0 0x0a2a +#define MT6328_VGP1_CON0 0x0a2c +#define MT6328_VGP1_CON1 0x0a2e +#define MT6328_VEFUSE_CON0 0x0a30 +#define MT6328_VSIM1_CON0 0x0a32 +#define MT6328_VSIM2_CON0 0x0a34 +#define MT6328_VIO18_CON0 0x0a36 +#define MT6328_VIBR_CON0 0x0a38 +#define MT6328_VCN18_CON0 0x0a3a +#define MT6328_VCAM_CON0 0x0a3c +#define MT6328_VCAMIO_CON0 0x0a3e +#define MT6328_LDO_VSRAM_CON0 0x0a40 +#define MT6328_LDO_VSRAM_CON1 0x0a42 +#define MT6328_VTREF_CON0 0x0a44 +#define MT6328_VM_CON0 0x0a46 +#define MT6328_VM_CON1 0x0a48 +#define MT6328_VRTC_CON0 0x0a4a +#define MT6328_LDO_OCFB0 0x0a4c +#define MT6328_ALDO_ANA_CON0 0x0a4e +#define MT6328_ADLDO_ANA_CON1 0x0a50 +#define MT6328_ADLDO_ANA_CON2 0x0a52 +#define MT6328_ADLDO_ANA_CON3 0x0a54 +#define MT6328_ADLDO_ANA_CON4 0x0a56 +#define MT6328_ADLDO_ANA_CON5 0x0a58 +#define MT6328_ADLDO_ANA_CON6 0x0a5a +#define MT6328_ADLDO_ANA_CON7 0x0a5c +#define MT6328_ADLDO_ANA_CON8 0x0a5e +#define MT6328_ADLDO_ANA_CON9 0x0a60 +#define MT6328_ADLDO_ANA_CON10 0x0a62 +#define MT6328_ADLDO_ANA_CON11 0x0a64 +#define MT6328_ADLDO_ANA_CON12 0x0a66 +#define MT6328_ADLDO_ANA_CON13 0x0a68 +#define MT6328_DLDO_ANA_CON0 0x0a6a +#define MT6328_DLDO_ANA_CON1 0x0a6c +#define MT6328_DLDO_ANA_CON2 0x0a6e +#define MT6328_DLDO_ANA_CON3 0x0a70 +#define MT6328_DLDO_ANA_CON4 0x0a72 +#define MT6328_DLDO_ANA_CON5 0x0a74 +#define MT6328_SLDO_ANA_CON0 0x0a76 +#define MT6328_SLDO_ANA_CON1 0x0a78 +#define MT6328_SLDO_ANA_CON2 0x0a7a +#define MT6328_SLDO_ANA_CON3 0x0a7c +#define MT6328_SLDO_ANA_CON4 0x0a7e +#define MT6328_SLDO_ANA_CON5 0x0a80 +#define MT6328_SLDO_ANA_CON6 0x0a82 +#define MT6328_SLDO_ANA_CON7 0x0a84 +#define MT6328_SLDO_ANA_CON8 0x0a86 +#define MT6328_SLDO_ANA_CON9 0x0a88 +#define MT6328_SLDO_ANA_CON10 0x0a8a +#define MT6328_LDO_RSV_CON0 0x0a8c +#define MT6328_LDO_RSV_CON1 0x0a8e +#define MT6328_SPK_CON0 0x0a90 +#define MT6328_SPK_CON1 0x0a92 +#define MT6328_SPK_CON2 0x0a94 +#define MT6328_SPK_CON3 0x0a96 +#define MT6328_SPK_CON4 0x0a98 +#define MT6328_SPK_CON5 0x0a9a +#define MT6328_SPK_CON6 0x0a9c +#define MT6328_SPK_CON7 0x0a9e +#define MT6328_SPK_CON8 0x0aa0 +#define MT6328_SPK_CON9 0x0aa2 +#define MT6328_SPK_CON10 0x0aa4 +#define MT6328_SPK_CON11 0x0aa6 +#define MT6328_SPK_CON12 0x0aa8 +#define MT6328_SPK_CON13 0x0aaa +#define MT6328_SPK_CON14 0x0aac +#define MT6328_SPK_CON15 0x0aae +#define MT6328_SPK_CON16 0x0ab0 +#define MT6328_SPK_ANA_CON0 0x0ab2 +#define MT6328_SPK_ANA_CON1 0x0ab4 +#define MT6328_SPK_ANA_CON3 0x0ab6 +#define MT6328_OTP_CON0 0x0c00 +#define MT6328_OTP_CON1 0x0c02 +#define MT6328_OTP_CON2 0x0c04 +#define MT6328_OTP_CON3 0x0c06 +#define MT6328_OTP_CON4 0x0c08 +#define MT6328_OTP_CON5 0x0c0a +#define MT6328_OTP_CON6 0x0c0c +#define MT6328_OTP_CON7 0x0c0e +#define MT6328_OTP_CON8 0x0c10 +#define MT6328_OTP_CON9 0x0c12 +#define MT6328_OTP_CON10 0x0c14 +#define MT6328_OTP_CON11 0x0c16 +#define MT6328_OTP_CON12 0x0c18 +#define MT6328_OTP_CON13 0x0c1a +#define MT6328_OTP_CON14 0x0c1c +#define MT6328_OTP_DOUT_0_15 0x0c1e +#define MT6328_OTP_DOUT_16_31 0x0c20 +#define MT6328_OTP_DOUT_32_47 0x0c22 +#define MT6328_OTP_DOUT_48_63 0x0c24 +#define MT6328_OTP_DOUT_64_79 0x0c26 +#define MT6328_OTP_DOUT_80_95 0x0c28 +#define MT6328_OTP_DOUT_96_111 0x0c2a +#define MT6328_OTP_DOUT_112_127 0x0c2c +#define MT6328_OTP_DOUT_128_143 0x0c2e +#define MT6328_OTP_DOUT_144_159 0x0c30 +#define MT6328_OTP_DOUT_160_175 0x0c32 +#define MT6328_OTP_DOUT_176_191 0x0c34 +#define MT6328_OTP_DOUT_192_207 0x0c36 +#define MT6328_OTP_DOUT_208_223 0x0c38 +#define MT6328_OTP_DOUT_224_239 0x0c3a +#define MT6328_OTP_DOUT_240_255 0x0c3c +#define MT6328_OTP_DOUT_256_271 0x0c3e +#define MT6328_OTP_DOUT_272_287 0x0c40 +#define MT6328_OTP_DOUT_288_303 0x0c42 +#define MT6328_OTP_DOUT_304_319 0x0c44 +#define MT6328_OTP_DOUT_320_335 0x0c46 +#define MT6328_OTP_DOUT_336_351 0x0c48 +#define MT6328_OTP_DOUT_352_367 0x0c4a +#define MT6328_OTP_DOUT_368_383 0x0c4c +#define MT6328_OTP_DOUT_384_399 0x0c4e +#define MT6328_OTP_DOUT_400_415 0x0c50 +#define MT6328_OTP_DOUT_416_431 0x0c52 +#define MT6328_OTP_DOUT_432_447 0x0c54 +#define MT6328_OTP_DOUT_448_463 0x0c56 +#define MT6328_OTP_DOUT_464_479 0x0c58 +#define MT6328_OTP_DOUT_480_495 0x0c5a +#define MT6328_OTP_DOUT_496_511 0x0c5c +#define MT6328_OTP_VAL_0_15 0x0c5e +#define MT6328_OTP_VAL_16_31 0x0c60 +#define MT6328_OTP_VAL_32_47 0x0c62 +#define MT6328_OTP_VAL_48_63 0x0c64 +#define MT6328_OTP_VAL_64_79 0x0c66 +#define MT6328_OTP_VAL_80_95 0x0c68 +#define MT6328_OTP_VAL_96_111 0x0c6a +#define MT6328_OTP_VAL_112_127 0x0c6c +#define MT6328_OTP_VAL_128_143 0x0c6e +#define MT6328_OTP_VAL_144_159 0x0c70 +#define MT6328_OTP_VAL_160_175 0x0c72 +#define MT6328_OTP_VAL_176_191 0x0c74 +#define MT6328_OTP_VAL_192_207 0x0c76 +#define MT6328_OTP_VAL_208_223 0x0c78 +#define MT6328_OTP_VAL_224_239 0x0c7a +#define MT6328_OTP_VAL_240_255 0x0c7c +#define MT6328_OTP_VAL_256_271 0x0c7e +#define MT6328_OTP_VAL_272_287 0x0c80 +#define MT6328_OTP_VAL_288_303 0x0c82 +#define MT6328_OTP_VAL_304_319 0x0c84 +#define MT6328_OTP_VAL_320_335 0x0c86 +#define MT6328_OTP_VAL_336_351 0x0c88 +#define MT6328_OTP_VAL_352_367 0x0c8a +#define MT6328_OTP_VAL_368_383 0x0c8c +#define MT6328_OTP_VAL_384_399 0x0c8e +#define MT6328_OTP_VAL_400_415 0x0c90 +#define MT6328_OTP_VAL_416_431 0x0c92 +#define MT6328_OTP_VAL_432_447 0x0c94 +#define MT6328_OTP_VAL_448_463 0x0c96 +#define MT6328_OTP_VAL_464_479 0x0c98 +#define MT6328_OTP_VAL_480_495 0x0c9a +#define MT6328_OTP_VAL_496_511 0x0c9c +#define MT6328_RTC_MIX_CON0 0x0c9e +#define MT6328_RTC_MIX_CON1 0x0ca0 +#define MT6328_RTC_MIX_CON2 0x0ca2 +#define MT6328_FGADC_CON0 0x0ca4 +#define MT6328_FGADC_CON1 0x0ca6 +#define MT6328_FGADC_CON2 0x0ca8 +#define MT6328_FGADC_CON3 0x0caa +#define MT6328_FGADC_CON4 0x0cac +#define MT6328_FGADC_CON5 0x0cae +#define MT6328_FGADC_CON6 0x0cb0 +#define MT6328_FGADC_CON7 0x0cb2 +#define MT6328_FGADC_CON8 0x0cb4 +#define MT6328_FGADC_CON9 0x0cb6 +#define MT6328_FGADC_CON10 0x0cb8 +#define MT6328_FGADC_CON11 0x0cba +#define MT6328_FGADC_CON12 0x0cbc +#define MT6328_FGADC_CON13 0x0cbe +#define MT6328_FGADC_CON14 0x0cc0 +#define MT6328_FGADC_CON15 0x0cc2 +#define MT6328_FGADC_CON16 0x0cc4 +#define MT6328_FGADC_CON17 0x0cc6 +#define MT6328_FGADC_CON18 0x0cc8 +#define MT6328_FGADC_CON19 0x0cca +#define MT6328_FGADC_CON20 0x0ccc +#define MT6328_FGADC_CON21 0x0cce +#define MT6328_FGADC_CON22 0x0cd0 +#define MT6328_FGADC_CON23 0x0cd2 +#define MT6328_FGADC_CON24 0x0cd4 +#define MT6328_FGADC_CON25 0x0cd6 +#define MT6328_FGADC_CON26 0x0cd8 +#define MT6328_FGADC_CON27 0x0cda +#define MT6328_AUDDEC_ANA_CON0 0x0cdc +#define MT6328_AUDDEC_ANA_CON1 0x0cde +#define MT6328_AUDDEC_ANA_CON2 0x0ce0 +#define MT6328_AUDDEC_ANA_CON3 0x0ce2 +#define MT6328_AUDDEC_ANA_CON4 0x0ce4 +#define MT6328_AUDDEC_ANA_CON5 0x0ce6 +#define MT6328_AUDDEC_ANA_CON6 0x0ce8 +#define MT6328_AUDDEC_ANA_CON7 0x0cea +#define MT6328_AUDDEC_ANA_CON8 0x0cec +#define MT6328_AUDENC_ANA_CON0 0x0cee +#define MT6328_AUDENC_ANA_CON1 0x0cf0 +#define MT6328_AUDENC_ANA_CON2 0x0cf2 +#define MT6328_AUDENC_ANA_CON3 0x0cf4 +#define MT6328_AUDENC_ANA_CON4 0x0cf6 +#define MT6328_AUDENC_ANA_CON5 0x0cf8 +#define MT6328_AUDENC_ANA_CON6 0x0cfa +#define MT6328_AUDENC_ANA_CON7 0x0cfc +#define MT6328_AUDENC_ANA_CON8 0x0cfe +#define MT6328_AUDENC_ANA_CON9 0x0d00 +#define MT6328_AUDENC_ANA_CON10 0x0d02 +#define MT6328_AUDNCP_CLKDIV_CON0 0x0d04 +#define MT6328_AUDNCP_CLKDIV_CON1 0x0d06 +#define MT6328_AUDNCP_CLKDIV_CON2 0x0d08 +#define MT6328_AUDNCP_CLKDIV_CON3 0x0d0a +#define MT6328_AUDNCP_CLKDIV_CON4 0x0d0c +#define MT6328_AUXADC_ADC0 0x0e00 +#define MT6328_AUXADC_ADC1 0x0e02 +#define MT6328_AUXADC_ADC2 0x0e04 +#define MT6328_AUXADC_ADC3 0x0e06 +#define MT6328_AUXADC_ADC4 0x0e08 +#define MT6328_AUXADC_ADC5 0x0e0a +#define MT6328_AUXADC_ADC6 0x0e0c +#define MT6328_AUXADC_ADC7 0x0e0e +#define MT6328_AUXADC_ADC8 0x0e10 +#define MT6328_AUXADC_ADC9 0x0e12 +#define MT6328_AUXADC_ADC10 0x0e14 +#define MT6328_AUXADC_ADC11 0x0e16 +#define MT6328_AUXADC_ADC12 0x0e18 +#define MT6328_AUXADC_ADC13 0x0e1a +#define MT6328_AUXADC_ADC14 0x0e1c +#define MT6328_AUXADC_ADC15 0x0e1e +#define MT6328_AUXADC_ADC16 0x0e20 +#define MT6328_AUXADC_ADC17 0x0e22 +#define MT6328_AUXADC_ADC18 0x0e24 +#define MT6328_AUXADC_ADC19 0x0e26 +#define MT6328_AUXADC_ADC20 0x0e28 +#define MT6328_AUXADC_ADC21 0x0e2a +#define MT6328_AUXADC_ADC22 0x0e2c +#define MT6328_AUXADC_ADC23 0x0e2e +#define MT6328_AUXADC_ADC24 0x0e30 +#define MT6328_AUXADC_ADC25 0x0e32 +#define MT6328_AUXADC_ADC26 0x0e34 +#define MT6328_AUXADC_ADC27 0x0e36 +#define MT6328_AUXADC_ADC28 0x0e38 +#define MT6328_AUXADC_ADC29 0x0e3a +#define MT6328_AUXADC_ADC30 0x0e3c +#define MT6328_AUXADC_ADC31 0x0e3e +#define MT6328_AUXADC_ADC32 0x0e40 +#define MT6328_AUXADC_ADC33 0x0e42 +#define MT6328_AUXADC_BUF0 0x0e44 +#define MT6328_AUXADC_BUF1 0x0e46 +#define MT6328_AUXADC_BUF2 0x0e48 +#define MT6328_AUXADC_BUF3 0x0e4a +#define MT6328_AUXADC_BUF4 0x0e4c +#define MT6328_AUXADC_BUF5 0x0e4e +#define MT6328_AUXADC_BUF6 0x0e50 +#define MT6328_AUXADC_BUF7 0x0e52 +#define MT6328_AUXADC_BUF8 0x0e54 +#define MT6328_AUXADC_BUF9 0x0e56 +#define MT6328_AUXADC_BUF10 0x0e58 +#define MT6328_AUXADC_BUF11 0x0e5a +#define MT6328_AUXADC_BUF12 0x0e5c +#define MT6328_AUXADC_BUF13 0x0e5e +#define MT6328_AUXADC_BUF14 0x0e60 +#define MT6328_AUXADC_BUF15 0x0e62 +#define MT6328_AUXADC_BUF16 0x0e64 +#define MT6328_AUXADC_BUF17 0x0e66 +#define MT6328_AUXADC_BUF18 0x0e68 +#define MT6328_AUXADC_BUF19 0x0e6a +#define MT6328_AUXADC_BUF20 0x0e6c +#define MT6328_AUXADC_BUF21 0x0e6e +#define MT6328_AUXADC_BUF22 0x0e70 +#define MT6328_AUXADC_BUF23 0x0e72 +#define MT6328_AUXADC_BUF24 0x0e74 +#define MT6328_AUXADC_BUF25 0x0e76 +#define MT6328_AUXADC_BUF26 0x0e78 +#define MT6328_AUXADC_BUF27 0x0e7a +#define MT6328_AUXADC_BUF28 0x0e7c +#define MT6328_AUXADC_BUF29 0x0e7e +#define MT6328_AUXADC_BUF30 0x0e80 +#define MT6328_AUXADC_BUF31 0x0e82 +#define MT6328_AUXADC_STA0 0x0e84 +#define MT6328_AUXADC_STA1 0x0e86 +#define MT6328_AUXADC_RQST0 0x0e88 +#define MT6328_AUXADC_RQST0_SET 0x0e8a +#define MT6328_AUXADC_RQST0_CLR 0x0e8c +#define MT6328_AUXADC_RQST1 0x0e8e +#define MT6328_AUXADC_RQST1_SET 0x0e90 +#define MT6328_AUXADC_RQST1_CLR 0x0e92 +#define MT6328_AUXADC_CON0 0x0e94 +#define MT6328_AUXADC_CON0_SET 0x0e96 +#define MT6328_AUXADC_CON0_CLR 0x0e98 +#define MT6328_AUXADC_CON1 0x0e9a +#define MT6328_AUXADC_CON2 0x0e9c +#define MT6328_AUXADC_CON3 0x0e9e +#define MT6328_AUXADC_CON4 0x0ea0 +#define MT6328_AUXADC_CON5 0x0ea2 +#define MT6328_AUXADC_CON6 0x0ea4 +#define MT6328_AUXADC_CON7 0x0ea6 +#define MT6328_AUXADC_CON8 0x0ea8 +#define MT6328_AUXADC_CON9 0x0eaa +#define MT6328_AUXADC_CON10 0x0eac +#define MT6328_AUXADC_CON11 0x0eae +#define MT6328_AUXADC_CON12 0x0eb0 +#define MT6328_AUXADC_CON13 0x0eb2 +#define MT6328_AUXADC_CON14 0x0eb4 +#define MT6328_AUXADC_CON15 0x0eb6 +#define MT6328_AUXADC_CON16 0x0eb8 +#define MT6328_AUXADC_AUTORPT0 0x0eba +#define MT6328_AUXADC_LBAT0 0x0ebc +#define MT6328_AUXADC_LBAT1 0x0ebe +#define MT6328_AUXADC_LBAT2 0x0ec0 +#define MT6328_AUXADC_LBAT3 0x0ec2 +#define MT6328_AUXADC_LBAT4 0x0ec4 +#define MT6328_AUXADC_LBAT5 0x0ec6 +#define MT6328_AUXADC_LBAT6 0x0ec8 +#define MT6328_AUXADC_ACCDET 0x0eca +#define MT6328_AUXADC_THR0 0x0ecc +#define MT6328_AUXADC_THR1 0x0ece +#define MT6328_AUXADC_THR2 0x0ed0 +#define MT6328_AUXADC_THR3 0x0ed2 +#define MT6328_AUXADC_THR4 0x0ed4 +#define MT6328_AUXADC_THR5 0x0ed6 +#define MT6328_AUXADC_THR6 0x0ed8 +#define MT6328_AUXADC_EFUSE0 0x0eda +#define MT6328_AUXADC_EFUSE1 0x0edc +#define MT6328_AUXADC_EFUSE2 0x0ede +#define MT6328_AUXADC_EFUSE3 0x0ee0 +#define MT6328_AUXADC_EFUSE4 0x0ee2 +#define MT6328_AUXADC_EFUSE5 0x0ee4 +#define MT6328_AUXADC_DBG0 0x0ee6 +#define MT6328_AUXADC_IMP0 0x0ee8 +#define MT6328_AUXADC_IMP1 0x0eea +#define MT6328_AUXADC_VISMPS0_1 0x0eec +#define MT6328_AUXADC_VISMPS0_2 0x0eee +#define MT6328_AUXADC_VISMPS0_3 0x0ef0 +#define MT6328_AUXADC_VISMPS0_4 0x0ef2 +#define MT6328_AUXADC_VISMPS0_5 0x0ef4 +#define MT6328_AUXADC_VISMPS0_6 0x0ef6 +#define MT6328_AUXADC_VISMPS0_7 0x0ef8 +#define MT6328_AUXADC_LBAT2_1 0x0efa +#define MT6328_AUXADC_LBAT2_2 0x0efc +#define MT6328_AUXADC_LBAT2_3 0x0efe +#define MT6328_AUXADC_LBAT2_4 0x0f00 +#define MT6328_AUXADC_LBAT2_5 0x0f02 +#define MT6328_AUXADC_LBAT2_6 0x0f04 +#define MT6328_AUXADC_LBAT2_7 0x0f06 +#define MT6328_AUXADC_MDBG_0 0x0f08 +#define MT6328_AUXADC_MDBG_1 0x0f0a +#define MT6328_AUXADC_MDBG_2 0x0f0c +#define MT6328_AUXADC_MDRT_0 0x0f0e +#define MT6328_AUXADC_MDRT_1 0x0f10 +#define MT6328_AUXADC_MDRT_2 0x0f12 +#define MT6328_ACCDET_CON0 0x0f14 +#define MT6328_ACCDET_CON1 0x0f16 +#define MT6328_ACCDET_CON2 0x0f18 +#define MT6328_ACCDET_CON3 0x0f1a +#define MT6328_ACCDET_CON4 0x0f1c +#define MT6328_ACCDET_CON5 0x0f1e +#define MT6328_ACCDET_CON6 0x0f20 +#define MT6328_ACCDET_CON7 0x0f22 +#define MT6328_ACCDET_CON8 0x0f24 +#define MT6328_ACCDET_CON9 0x0f26 +#define MT6328_ACCDET_CON10 0x0f28 +#define MT6328_ACCDET_CON11 0x0f2a +#define MT6328_ACCDET_CON12 0x0f2c +#define MT6328_ACCDET_CON13 0x0f2e +#define MT6328_ACCDET_CON14 0x0f30 +#define MT6328_ACCDET_CON15 0x0f32 +#define MT6328_ACCDET_CON16 0x0f34 +#define MT6328_ACCDET_CON17 0x0f36 +#define MT6328_ACCDET_CON18 0x0f38 +#define MT6328_ACCDET_CON19 0x0f3a +#define MT6328_ACCDET_CON20 0x0f3c +#define MT6328_ACCDET_CON21 0x0f3e +#define MT6328_ACCDET_CON22 0x0f40 +#define MT6328_ACCDET_CON23 0x0f42 +#define MT6328_ACCDET_CON24 0x0f44 +#define MT6328_ACCDET_CON25 0x0f46 +#define MT6328_CHR_CON0 0x0f48 +#define MT6328_CHR_CON1 0x0f4a +#define MT6328_CHR_CON2 0x0f4c +#define MT6328_CHR_CON3 0x0f4e +#define MT6328_CHR_CON4 0x0f50 +#define MT6328_CHR_CON5 0x0f52 +#define MT6328_CHR_CON6 0x0f54 +#define MT6328_CHR_CON7 0x0f56 +#define MT6328_CHR_CON8 0x0f58 +#define MT6328_CHR_CON9 0x0f5a +#define MT6328_CHR_CON10 0x0f5c +#define MT6328_CHR_CON11 0x0f5e +#define MT6328_CHR_CON12 0x0f60 +#define MT6328_CHR_CON13 0x0f62 +#define MT6328_CHR_CON14 0x0f64 +#define MT6328_CHR_CON15 0x0f66 +#define MT6328_CHR_CON16 0x0f68 +#define MT6328_CHR_CON17 0x0f6a +#define MT6328_CHR_CON18 0x0f6c +#define MT6328_CHR_CON19 0x0f6e +#define MT6328_CHR_CON20 0x0f70 +#define MT6328_CHR_CON21 0x0f72 +#define MT6328_CHR_CON22 0x0f74 +#define MT6328_CHR_CON23 0x0f76 +#define MT6328_CHR_CON24 0x0f78 +#define MT6328_CHR_CON25 0x0f7a +#define MT6328_CHR_CON26 0x0f7c +#define MT6328_CHR_CON27 0x0f7e +#define MT6328_CHR_CON28 0x0f80 +#define MT6328_CHR_CON29 0x0f82 +#define MT6328_CHR_CON30 0x0f84 +#define MT6328_CHR_CON31 0x0f86 +#define MT6328_CHR_CON32 0x0f88 +#define MT6328_CHR_CON33 0x0f8a +#define MT6328_CHR_CON34 0x0f8c +#define MT6328_CHR_CON35 0x0f8e +#define MT6328_CHR_CON36 0x0f90 +#define MT6328_CHR_CON37 0x0f92 +#define MT6328_CHR_CON38 0x0f94 +#define MT6328_CHR_CON39 0x0f96 +#define MT6328_CHR_CON40 0x0f98 +#define MT6328_CHR_CON41 0x0f9a +#define MT6328_CHR_CON42 0x0f9c +#define MT6328_BATON_CON0 0x0f9e +#define MT6328_CHR_CON43 0x0fa0 +#define MT6328_EOSC_CALI_CON0 0x0faa +#define MT6328_EOSC_CALI_CON1 0x0fac +#define MT6328_VRTC_PWM_CON0 0x0fae + +#endif /* __MFD_MT6328_REGISTERS_H__ */ diff --git a/include/linux/mfd/mt6397/core.h b/include/linux/mfd/mt6397/core.h index 627487e26287..b774c3a4bb62 100644 --- a/include/linux/mfd/mt6397/core.h +++ b/include/linux/mfd/mt6397/core.h @@ -12,6 +12,7 @@ enum chip_id { MT6323_CHIP_ID = 0x23, + MT6328_CHIP_ID = 0x30, MT6331_CHIP_ID = 0x20, MT6332_CHIP_ID = 0x20, MT6357_CHIP_ID = 0x57, @@ -65,11 +66,11 @@ struct mt6397_chip { int irq; struct irq_domain *irq_domain; struct mutex irqlock; - u16 wake_mask[2]; - u16 irq_masks_cur[2]; - u16 irq_masks_cache[2]; - u16 int_con[2]; - u16 int_status[2]; + u16 wake_mask[3]; + u16 irq_masks_cur[3]; + u16 irq_masks_cache[3]; + u16 int_con[3]; + u16 int_status[3]; u16 chip_id; void *irq_data; }; -- cgit v1.2.3 From f187b9bf1a639090893c31030ddb60f9beae23f0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Oct 2024 06:18:52 +0100 Subject: block: remove bio_add_zone_append_page This is only used by the nvmet zns passthrough code, which can trivially just use bio_add_pc_page and do the sanity check for the max zone append limit itself. All future zoned file systems should follow the btrfs lead and let the upper layers fill up bios unlimited by hardware constraints and split them to the limits in the I/O submission handler. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20241030051859.280923-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index faceadb040f9..4a1bf43ca53d 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -418,8 +418,6 @@ bool __must_check bio_add_folio(struct bio *bio, struct folio *folio, size_t len, size_t off); extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, unsigned int, unsigned int); -int bio_add_zone_append_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int offset); void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off); void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, -- cgit v1.2.3 From 16aed0a6520ba01b7d22c32e193fc1ec674f92d4 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 21 Oct 2024 11:45:06 -0400 Subject: i3c: master: Replace hard code 2 with macro I3C_ADDR_SLOT_STATUS_BITS Replace the hardcoded value 2, which indicates 2 bits for I3C address status, with the predefined macro I3C_ADDR_SLOT_STATUS_BITS. Improve maintainability and extensibility of the code. Reviewed-by: Miquel Raynal Signed-off-by: Frank Li Link: https://lore.kernel.org/r/20241021-i3c_dts_assign-v8-1-4098b8bde01e@nxp.com Signed-off-by: Alexandre Belloni --- include/linux/i3c/master.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h index 2a1ed05d5782..2100547b2d8d 100644 --- a/include/linux/i3c/master.h +++ b/include/linux/i3c/master.h @@ -313,6 +313,8 @@ enum i3c_addr_slot_status { I3C_ADDR_SLOT_STATUS_MASK = 3, }; +#define I3C_ADDR_SLOT_STATUS_BITS 2 + /** * struct i3c_bus - I3C bus object * @cur_master: I3C master currently driving the bus. Since I3C is multi-master @@ -354,7 +356,7 @@ enum i3c_addr_slot_status { struct i3c_bus { struct i3c_dev_desc *cur_master; int id; - unsigned long addrslots[((I2C_MAX_ADDR + 1) * 2) / BITS_PER_LONG]; + unsigned long addrslots[((I2C_MAX_ADDR + 1) * I3C_ADDR_SLOT_STATUS_BITS) / BITS_PER_LONG]; enum i3c_bus_mode mode; struct { unsigned long i3c; -- cgit v1.2.3 From 2f552fa280590e61bd3dbe66a7b54b99caa642a4 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 21 Oct 2024 11:45:07 -0400 Subject: i3c: master: Extend address status bit to 4 and add I3C_ADDR_SLOT_EXT_DESIRED MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend the address status bit to 4 and introduce the I3C_ADDR_SLOT_EXT_DESIRED macro to indicate that a device prefers a specific address. This is generally set by the 'assigned-address' in the device tree source (dts) file. ┌────┬─────────────┬───┬─────────┬───┐ │S/Sr│ 7'h7E RnW=0 │ACK│ ENTDAA │ T ├────┐ └────┴─────────────┴───┴─────────┴───┘ │ ┌─────────────────────────────────────────┘ │ ┌──┬─────────────┬───┬─────────────────┬────────────────┬───┬─────────┐ └─►│Sr│7'h7E RnW=1 │ACK│48bit UID BCR DCR│Assign 7bit Addr│PAR│ ACK/NACK│ └──┴─────────────┴───┴─────────────────┴────────────────┴───┴─────────┘ Some master controllers (such as HCI) need to prepare the entire above transaction before sending it out to the I3C bus. This means that a 7-bit dynamic address needs to be allocated before knowing the target device's UID information. However, some I3C targets may request specific addresses (called as "init_dyn_addr"), which is typically specified by the DT-'s assigned-address property. Lower addresses having higher IBI priority. If it is available, i3c_bus_get_free_addr() preferably return a free address that is not in the list of desired addresses (called as "init_dyn_addr"). This allows the device with the "init_dyn_addr" to switch to its "init_dyn_addr" when it hot-joins the I3C bus. Otherwise, if the "init_dyn_addr" is already in use by another I3C device, the target device will not be able to switch to its desired address. If the previous step fails, fallback returning one of the remaining unassigned address, regardless of its state in the desired list. Reviewed-by: Miquel Raynal Signed-off-by: Frank Li Link: https://lore.kernel.org/r/20241021-i3c_dts_assign-v8-2-4098b8bde01e@nxp.com Signed-off-by: Alexandre Belloni --- include/linux/i3c/master.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h index 2100547b2d8d..6e5328c6c6af 100644 --- a/include/linux/i3c/master.h +++ b/include/linux/i3c/master.h @@ -298,7 +298,8 @@ enum i3c_open_drain_speed { * @I3C_ADDR_SLOT_I2C_DEV: address is assigned to an I2C device * @I3C_ADDR_SLOT_I3C_DEV: address is assigned to an I3C device * @I3C_ADDR_SLOT_STATUS_MASK: address slot mask - * + * @I3C_ADDR_SLOT_EXT_DESIRED: the bitmask represents addresses that are preferred by some devices, + * such as the "assigned-address" property in a device tree source. * On an I3C bus, addresses are assigned dynamically, and we need to know which * addresses are free to use and which ones are already assigned. * @@ -311,9 +312,11 @@ enum i3c_addr_slot_status { I3C_ADDR_SLOT_I2C_DEV, I3C_ADDR_SLOT_I3C_DEV, I3C_ADDR_SLOT_STATUS_MASK = 3, + I3C_ADDR_SLOT_EXT_STATUS_MASK = 7, + I3C_ADDR_SLOT_EXT_DESIRED = BIT(2), }; -#define I3C_ADDR_SLOT_STATUS_BITS 2 +#define I3C_ADDR_SLOT_STATUS_BITS 4 /** * struct i3c_bus - I3C bus object -- cgit v1.2.3 From a911bad094b010e276f072fe9a599b66e59ed5fe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 29 Oct 2024 19:14:25 +0000 Subject: dql: annotate data-races around dql->last_obj_cnt dql->last_obj_cnt is read/written from different contexts, without any lock synchronization. Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing. Signed-off-by: Eric Dumazet Reviewed-by: Joe Damato Link: https://patch.msgid.link/20241029191425.2519085-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/dynamic_queue_limits.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dynamic_queue_limits.h b/include/linux/dynamic_queue_limits.h index 281298e77a15..808b1a5102e7 100644 --- a/include/linux/dynamic_queue_limits.h +++ b/include/linux/dynamic_queue_limits.h @@ -127,7 +127,7 @@ static inline void dql_queued(struct dql *dql, unsigned int count) if (WARN_ON_ONCE(count > DQL_MAX_OBJECT)) return; - dql->last_obj_cnt = count; + WRITE_ONCE(dql->last_obj_cnt, count); /* We want to force a write first, so that cpu do not attempt * to get cache line containing last_obj_cnt, num_queued, adj_limit -- cgit v1.2.3 From d3eeb1ac0b99cdcd99420fb270041da946d2d360 Mon Sep 17 00:00:00 2001 From: Angelo Dureghello Date: Mon, 28 Oct 2024 22:45:30 +0100 Subject: iio: backend: extend features Extend backend features with new calls needed later on this patchset from axi version of ad3552r. The follwoing calls are added: iio_backend_ddr_enable() enable ddr bus transfer iio_backend_ddr_disable() disable ddr bus transfer iio_backend_data_stream_enable() enable data stream over bus interface iio_backend_data_stream_disable() disable data stream over bus interface iio_backend_data_transfer_addr() define the target register address where the DAC sample will be written. Reviewed-by: Nuno Sa Signed-off-by: Angelo Dureghello Reviewed-by: David Lechner Link: https://patch.msgid.link/20241028-wip-bl-ad3552r-axi-v0-iio-testing-v9-3-f6960b4f9719@kernel-space.org Signed-off-by: Jonathan Cameron --- include/linux/iio/backend.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h index 37d56914d485..10be00f3b120 100644 --- a/include/linux/iio/backend.h +++ b/include/linux/iio/backend.h @@ -14,12 +14,14 @@ struct iio_dev; enum iio_backend_data_type { IIO_BACKEND_TWOS_COMPLEMENT, IIO_BACKEND_OFFSET_BINARY, + IIO_BACKEND_DATA_UNSIGNED, IIO_BACKEND_DATA_TYPE_MAX }; enum iio_backend_data_source { IIO_BACKEND_INTERNAL_CONTINUOUS_WAVE, IIO_BACKEND_EXTERNAL, + IIO_BACKEND_INTERNAL_RAMP_16BIT, IIO_BACKEND_DATA_SOURCE_MAX }; @@ -89,6 +91,11 @@ enum iio_backend_sample_trigger { * @read_raw: Read a channel attribute from a backend device * @debugfs_print_chan_status: Print channel status into a buffer. * @debugfs_reg_access: Read or write register value of backend. + * @ddr_enable: Enable interface DDR (Double Data Rate) mode. + * @ddr_disable: Disable interface DDR (Double Data Rate) mode. + * @data_stream_enable: Enable data stream. + * @data_stream_disable: Disable data stream. + * @data_transfer_addr: Set data address. **/ struct iio_backend_ops { int (*enable)(struct iio_backend *back); @@ -129,6 +136,11 @@ struct iio_backend_ops { size_t len); int (*debugfs_reg_access)(struct iio_backend *back, unsigned int reg, unsigned int writeval, unsigned int *readval); + int (*ddr_enable)(struct iio_backend *back); + int (*ddr_disable)(struct iio_backend *back); + int (*data_stream_enable)(struct iio_backend *back); + int (*data_stream_disable)(struct iio_backend *back); + int (*data_transfer_addr)(struct iio_backend *back, u32 address); }; /** @@ -164,6 +176,11 @@ int iio_backend_data_sample_trigger(struct iio_backend *back, int devm_iio_backend_request_buffer(struct device *dev, struct iio_backend *back, struct iio_dev *indio_dev); +int iio_backend_ddr_enable(struct iio_backend *back); +int iio_backend_ddr_disable(struct iio_backend *back); +int iio_backend_data_stream_enable(struct iio_backend *back); +int iio_backend_data_stream_disable(struct iio_backend *back); +int iio_backend_data_transfer_addr(struct iio_backend *back, u32 address); ssize_t iio_backend_ext_info_set(struct iio_dev *indio_dev, uintptr_t private, const struct iio_chan_spec *chan, const char *buf, size_t len); -- cgit v1.2.3 From e9f0a363473585a0f51b1b61a9bb15a63808d6ea Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 22 Oct 2024 13:01:12 +0200 Subject: tracing: Remove TRACE_FLAG_IRQS_NOSUPPORT It was possible to enable tracing with no IRQ tracing support. The tracing infrastructure would then record TRACE_FLAG_IRQS_NOSUPPORT as the only tracing flag and show an 'X' in the output. The last user of this feature was PPC32 which managed to implement it during PowerPC merge in 2009. Since then, it was unused and the PPC32 dependency was finally removed in commit 0ea5ee035133a ("tracing: Remove PPC32 wart from config TRACING_SUPPORT"). Since the PowerPC merge the code behind !CONFIG_TRACE_IRQFLAGS_SUPPORT with TRACING enabled can no longer be selected used and the 'X' is not displayed or recorded. Remove the CONFIG_TRACE_IRQFLAGS_SUPPORT from the tracing code. Remove TRACE_FLAG_IRQS_NOSUPPORT. Cc: Peter Zijlstra Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241022110112.XJI8I9T2@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index f8f2e52653df..016b29a56c87 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -184,7 +184,6 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status); enum trace_flag_type { TRACE_FLAG_IRQS_OFF = 0x01, - TRACE_FLAG_IRQS_NOSUPPORT = 0x02, TRACE_FLAG_NEED_RESCHED = 0x04, TRACE_FLAG_HARDIRQ = 0x08, TRACE_FLAG_SOFTIRQ = 0x10, @@ -193,7 +192,6 @@ enum trace_flag_type { TRACE_FLAG_BH_OFF = 0x80, }; -#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags) { unsigned int irq_status = irqs_disabled_flags(irqflags) ? @@ -207,17 +205,6 @@ static inline unsigned int tracing_gen_ctx(void) local_save_flags(irqflags); return tracing_gen_ctx_flags(irqflags); } -#else - -static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags) -{ - return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); -} -static inline unsigned int tracing_gen_ctx(void) -{ - return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); -} -#endif static inline unsigned int tracing_gen_ctx_dec(void) { -- cgit v1.2.3 From a9cfb8778c43fc473ae16cddb6e9611705721b31 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 31 Oct 2024 11:20:53 -0400 Subject: tracing: Introduce tracepoint extended structure Shrink the struct tracepoint size from 80 bytes to 72 bytes on x86-64 by moving the (typically NULL) regfunc/unregfunc pointers to an extended structure. Tested-by: Jordan Rife Cc: Michael Jeanson Cc: Thomas Gleixner Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Cc: Jordan Rife Cc: linux-trace-kernel@vger.kernel.org Link: https://lore.kernel.org/20241031152056.744137-2-mathieu.desnoyers@efficios.com Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint-defs.h | 8 ++++++-- include/linux/tracepoint.h | 19 +++++++++++++------ 2 files changed, 19 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h index 60a6e8314d4c..967c08d9da84 100644 --- a/include/linux/tracepoint-defs.h +++ b/include/linux/tracepoint-defs.h @@ -29,6 +29,11 @@ struct tracepoint_func { int prio; }; +struct tracepoint_ext { + int (*regfunc)(void); + void (*unregfunc)(void); +}; + struct tracepoint { const char *name; /* Tracepoint name */ struct static_key_false key; @@ -36,9 +41,8 @@ struct tracepoint { void *static_call_tramp; void *iterator; void *probestub; - int (*regfunc)(void); - void (*unregfunc)(void); struct tracepoint_func __rcu *funcs; + struct tracepoint_ext *ext; }; #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 0dc67fad706c..862ab49177a4 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -302,7 +302,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) * structures, so we create an array of pointers that will be used for iteration * on the tracepoints. */ -#define DEFINE_TRACE_FN(_name, _reg, _unreg, proto, args) \ +#define __DEFINE_TRACE_EXT(_name, _ext, proto, args) \ static const char __tpstrtab_##_name[] \ __section("__tracepoints_strings") = #_name; \ extern struct static_call_key STATIC_CALL_KEY(tp_func_##_name); \ @@ -316,9 +316,9 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) .static_call_tramp = STATIC_CALL_TRAMP_ADDR(tp_func_##_name), \ .iterator = &__traceiter_##_name, \ .probestub = &__probestub_##_name, \ - .regfunc = _reg, \ - .unregfunc = _unreg, \ - .funcs = NULL }; \ + .funcs = NULL, \ + .ext = _ext, \ + }; \ __TRACEPOINT_ENTRY(_name); \ int __traceiter_##_name(void *__data, proto) \ { \ @@ -341,8 +341,15 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) } \ DEFINE_STATIC_CALL(tp_func_##_name, __traceiter_##_name); -#define DEFINE_TRACE(name, proto, args) \ - DEFINE_TRACE_FN(name, NULL, NULL, PARAMS(proto), PARAMS(args)); +#define DEFINE_TRACE_FN(_name, _reg, _unreg, _proto, _args) \ + static struct tracepoint_ext __tracepoint_ext_##_name = { \ + .regfunc = _reg, \ + .unregfunc = _unreg, \ + }; \ + __DEFINE_TRACE_EXT(_name, &__tracepoint_ext_##_name, PARAMS(_proto), PARAMS(_args)); + +#define DEFINE_TRACE(_name, _proto, _args) \ + __DEFINE_TRACE_EXT(_name, NULL, PARAMS(_proto), PARAMS(_args)); #define EXPORT_TRACEPOINT_SYMBOL_GPL(name) \ EXPORT_SYMBOL_GPL(__tracepoint_##name); \ -- cgit v1.2.3 From 654ced4a13774e5aec4d9466c97d22df7cb1e60b Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 31 Oct 2024 11:20:54 -0400 Subject: tracing: Introduce tracepoint_is_faultable() Introduce a "faultable" flag within the extended structure to know whether a tracepoint needs rcu tasks trace grace period before reclaim. This can be queried using tracepoint_is_faultable(). Acked-by: Andrii Nakryiko Tested-by: Jordan Rife Cc: Michael Jeanson Cc: Thomas Gleixner Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Cc: Jordan Rife Cc: linux-trace-kernel@vger.kernel.org Link: https://lore.kernel.org/20241031152056.744137-3-mathieu.desnoyers@efficios.com Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint-defs.h | 2 ++ include/linux/tracepoint.h | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h index 967c08d9da84..aebf0571c736 100644 --- a/include/linux/tracepoint-defs.h +++ b/include/linux/tracepoint-defs.h @@ -32,6 +32,8 @@ struct tracepoint_func { struct tracepoint_ext { int (*regfunc)(void); void (*unregfunc)(void); + /* Flags. */ + unsigned int faultable:1; }; struct tracepoint { diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 862ab49177a4..906f3091d23d 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -104,6 +104,12 @@ void for_each_tracepoint_in_module(struct module *mod, * tracepoint_synchronize_unregister must be called between the last tracepoint * probe unregistration and the end of module exit to make sure there is no * caller executing a probe when it is freed. + * + * An alternative is to use the following for batch reclaim associated + * with a given tracepoint: + * + * - tracepoint_is_faultable() == false: call_rcu() + * - tracepoint_is_faultable() == true: call_rcu_tasks_trace() */ #ifdef CONFIG_TRACEPOINTS static inline void tracepoint_synchronize_unregister(void) @@ -111,9 +117,17 @@ static inline void tracepoint_synchronize_unregister(void) synchronize_rcu_tasks_trace(); synchronize_rcu(); } +static inline bool tracepoint_is_faultable(struct tracepoint *tp) +{ + return tp->ext && tp->ext->faultable; +} #else static inline void tracepoint_synchronize_unregister(void) { } +static inline bool tracepoint_is_faultable(struct tracepoint *tp) +{ + return false; +} #endif #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS @@ -345,6 +359,15 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) static struct tracepoint_ext __tracepoint_ext_##_name = { \ .regfunc = _reg, \ .unregfunc = _unreg, \ + .faultable = false, \ + }; \ + __DEFINE_TRACE_EXT(_name, &__tracepoint_ext_##_name, PARAMS(_proto), PARAMS(_args)); + +#define DEFINE_TRACE_SYSCALL(_name, _reg, _unreg, _proto, _args) \ + static struct tracepoint_ext __tracepoint_ext_##_name = { \ + .regfunc = _reg, \ + .unregfunc = _unreg, \ + .faultable = true, \ }; \ __DEFINE_TRACE_EXT(_name, &__tracepoint_ext_##_name, PARAMS(_proto), PARAMS(_args)); @@ -389,6 +412,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DECLARE_TRACE_SYSCALL __DECLARE_TRACE #define DEFINE_TRACE_FN(name, reg, unreg, proto, args) +#define DEFINE_TRACE_SYSCALL(name, reg, unreg, proto, args) #define DEFINE_TRACE(name, proto, args) #define EXPORT_TRACEPOINT_SYMBOL_GPL(name) #define EXPORT_TRACEPOINT_SYMBOL(name) -- cgit v1.2.3 From ee3685a98ea920369696d31e54d1fe6a939ed65f Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 31 Oct 2024 11:20:56 -0400 Subject: tracing: Add might_fault() check in __DECLARE_TRACE_SYSCALL Catch incorrect use of syscall tracepoints even if no probes are registered by adding a might_fault() check in trace_##name() emitted by __DECLARE_TRACE_SYSCALL. Suggested-by: Thomas Gleixner Tested-by: Jordan Rife Cc: Thomas Gleixner Cc: Michael Jeanson Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Cc: Jordan Rife Cc: linux-trace-kernel@vger.kernel.org Link: https://lore.kernel.org/20241031152056.744137-5-mathieu.desnoyers@efficios.com Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 906f3091d23d..425123e921ac 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -301,6 +301,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), cond, PARAMS(data_proto)) \ static inline void trace_##name(proto) \ { \ + might_fault(); \ if (static_branch_unlikely(&__tracepoint_##name.key)) \ __DO_TRACE(name, \ TP_ARGS(args), \ -- cgit v1.2.3 From 61c6fefa92bb4ed7a34163b94f6ffac628237a29 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 1 Nov 2024 11:17:53 -0700 Subject: bpf: decouple BPF link/attach hook and BPF program sleepable semantics BPF link's lifecycle protection scheme depends on both BPF hook and BPF program. If *either* of those require RCU Tasks Trace GP, then we need to go through a chain of GPs before putting BPF program refcount and deallocating BPF link memory. This patch adds bpf_link-specific sleepable flag, which can be set to true even if underlying BPF program is not sleepable itself. If either link->sleepable or link->prog->sleepable is true, we'll go through a chain of RCU Tasks Trace GP and RCU GP before putting BPF program and freeing memory. This will be used to protect BPF link for sleepable (faultable) raw tracepoints in the next patch. Link: https://lore.kernel.org/20241101181754.782341-2-andrii@kernel.org Tested-by: Jordan Rife Signed-off-by: Andrii Nakryiko Signed-off-by: Steven Rostedt (Google) --- include/linux/bpf.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 19d8ca8ac960..e7236facadd4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1598,6 +1598,11 @@ struct bpf_link { enum bpf_link_type type; const struct bpf_link_ops *ops; struct bpf_prog *prog; + /* whether BPF link itself has "sleepable" semantics, which can differ + * from underlying BPF program having a "sleepable" semantics, as BPF + * link's semantics is determined by target attach hook + */ + bool sleepable; /* rcu is used before freeing, work can be used to schedule that * RCU-based freeing before that, so they never overlap */ @@ -1614,8 +1619,10 @@ struct bpf_link_ops { */ void (*dealloc)(struct bpf_link *link); /* deallocate link resources callback, called after RCU grace period; - * if underlying BPF program is sleepable we go through tasks trace - * RCU GP and then "classic" RCU GP + * if either the underlying BPF program is sleepable or BPF link's + * target hook is sleepable, we'll go through tasks trace RCU GP and + * then "classic" RCU GP; this need for chaining tasks trace and + * classic RCU GPs is designated by setting bpf_link->sleepable flag */ void (*dealloc_deferred)(struct bpf_link *link); int (*detach)(struct bpf_link *link); @@ -2362,6 +2369,9 @@ int bpf_prog_new_fd(struct bpf_prog *prog); void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog); +void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct bpf_prog *prog, + bool sleepable); int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer); int bpf_link_settle(struct bpf_link_primer *primer); void bpf_link_cleanup(struct bpf_link_primer *primer); @@ -2717,6 +2727,12 @@ static inline void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, { } +static inline void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct bpf_prog *prog, + bool sleepable) +{ +} + static inline int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) { -- cgit v1.2.3 From d44d26987bb3df6d76556827097fc9ce17565cb8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 31 Oct 2024 13:04:07 +0100 Subject: timekeeping: Remove CONFIG_DEBUG_TIMEKEEPING Since 135225a363ae timekeeping_cycles_to_ns() handles large offsets which would lead to 64bit multiplication overflows correctly. It's also protected against negative motion of the clocksource unconditionally, which was exclusive to x86 before. timekeeping_advance() handles large offsets already correctly. That means the value of CONFIG_DEBUG_TIMEKEEPING which analyzed these cases is very close to zero. Remove all of it. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20241031120328.536010148@linutronix.de --- include/linux/timekeeper_internal.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index a3b6380a7777..e39d4d563b19 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -76,9 +76,6 @@ struct tk_read_base { * ntp shifted nano seconds. * @ntp_err_mult: Multiplication factor for scaled math conversion * @skip_second_overflow: Flag used to avoid updating NTP twice with same second - * @last_warning: Warning ratelimiter (DEBUG_TIMEKEEPING) - * @underflow_seen: Underflow warning flag (DEBUG_TIMEKEEPING) - * @overflow_seen: Overflow warning flag (DEBUG_TIMEKEEPING) * * Note: For timespec(64) based interfaces wall_to_monotonic is what * we need to add to xtime (or xtime corrected for sub jiffy times) @@ -147,19 +144,6 @@ struct timekeeper { u32 ntp_error_shift; u32 ntp_err_mult; u32 skip_second_overflow; - -#ifdef CONFIG_DEBUG_TIMEKEEPING - long last_warning; - /* - * These simple flag variables are managed - * without locks, which is racy, but they are - * ok since we don't really care about being - * super precise about how many events were - * seen, just that a problem was observed. - */ - int underflow_seen; - int overflow_seen; -#endif }; #ifdef CONFIG_GENERIC_TIME_VSYSCALL -- cgit v1.2.3 From 2a69297eed87c1f3ad33b8a169025c06adde5dcf Mon Sep 17 00:00:00 2001 From: Qi Tao Date: Sat, 26 Oct 2024 17:46:51 +0800 Subject: crypto: hisilicon - support querying the capability register Query the capability register status of accelerator devices (SEC, HPRE and ZIP) through the debugfs interface, for example: cat cap_regs. The purpose is to improve the robustness and locability of hardware devices and drivers. Signed-off-by: Qi Tao Signed-off-by: Chenghai Huang Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 389e95754776..7272eac850e8 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -274,13 +274,25 @@ struct hisi_qm_cap_info { u32 v3_val; }; +struct hisi_qm_cap_query_info { + u32 type; + const char *name; + u32 offset; + u32 v1_val; + u32 v2_val; + u32 v3_val; +}; + struct hisi_qm_cap_record { u32 type; + const char *name; u32 cap_val; }; struct hisi_qm_cap_tables { + u32 qm_cap_size; struct hisi_qm_cap_record *qm_cap_table; + u32 dev_cap_size; struct hisi_qm_cap_record *dev_cap_table; }; @@ -554,6 +566,9 @@ void hisi_qm_regs_dump(struct seq_file *s, struct debugfs_regset32 *regset); u32 hisi_qm_get_hw_info(struct hisi_qm *qm, const struct hisi_qm_cap_info *info_table, u32 index, bool is_read); +u32 hisi_qm_get_cap_value(struct hisi_qm *qm, + const struct hisi_qm_cap_query_info *info_table, + u32 index, bool is_read); int hisi_qm_set_algs(struct hisi_qm *qm, u64 alg_msk, const struct qm_dev_alg *dev_algs, u32 dev_algs_size); -- cgit v1.2.3 From c418ba6baca3ae10ffaf47b0803d2a9e6bf1af96 Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Sat, 26 Oct 2024 19:44:29 +0800 Subject: crypto: hisilicon/qm - disable same error report before resetting If an error indicating that the device needs to be reset is reported, disable the error reporting before device reset is complete, enable the error reporting after the reset is complete to prevent the same error from being reported repeatedly. Fixes: eaebf4c3b103 ("crypto: hisilicon - Unify hardware error init/uninit into QM") Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 7272eac850e8..6dbd0d49628f 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -229,6 +229,12 @@ struct hisi_qm_status { struct hisi_qm; +enum acc_err_result { + ACC_ERR_NONE, + ACC_ERR_NEED_RESET, + ACC_ERR_RECOVERED, +}; + struct hisi_qm_err_info { char *acpi_rst; u32 msi_wr_port; @@ -257,9 +263,9 @@ struct hisi_qm_err_ini { void (*close_axi_master_ooo)(struct hisi_qm *qm); void (*open_sva_prefetch)(struct hisi_qm *qm); void (*close_sva_prefetch)(struct hisi_qm *qm); - void (*log_dev_hw_err)(struct hisi_qm *qm, u32 err_sts); void (*show_last_dfx_regs)(struct hisi_qm *qm); void (*err_info_init)(struct hisi_qm *qm); + enum acc_err_result (*get_err_result)(struct hisi_qm *qm); }; struct hisi_qm_cap_info { -- cgit v1.2.3 From f6ca73063754950bf3fbad753e3a9557e3aa85e3 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Sat, 2 Nov 2024 14:28:41 +0100 Subject: i3c: Document I3C_ADDR_SLOT_EXT_STATUS_MASK As the mask is part of the enum, document it. Reported-by: Stephen Rothwell Link: https://lore.kernel.org/r/20241102132841.2446176-1-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- include/linux/i3c/master.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h index 6e5328c6c6af..12d532b012c5 100644 --- a/include/linux/i3c/master.h +++ b/include/linux/i3c/master.h @@ -298,6 +298,7 @@ enum i3c_open_drain_speed { * @I3C_ADDR_SLOT_I2C_DEV: address is assigned to an I2C device * @I3C_ADDR_SLOT_I3C_DEV: address is assigned to an I3C device * @I3C_ADDR_SLOT_STATUS_MASK: address slot mask + * @I3C_ADDR_SLOT_EXT_STATUS_MASK: address slot mask with extended information * @I3C_ADDR_SLOT_EXT_DESIRED: the bitmask represents addresses that are preferred by some devices, * such as the "assigned-address" property in a device tree source. * On an I3C bus, addresses are assigned dynamically, and we need to know which -- cgit v1.2.3 From 7029acd8a950393ee3a3d8e1a7ee1a9b77808a3b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 25 Oct 2024 19:27:39 -0600 Subject: io_uring/rsrc: get rid of per-ring io_rsrc_node list Work in progress, but get rid of the per-ring serialization of resource nodes, like registered buffers and files. Main issue here is that one node can otherwise hold up a bunch of other nodes from getting freed, which is especially a problem for file resource nodes and networked workloads where some descriptors may not see activity in a long time. As an example, instantiate an io_uring ring fd and create a sparse registered file table. Even 2 will do. Then create a socket and register it as fixed file 0, F0. The number of open files in the app is now 5, with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4 being the socket. Register this socket (eg "the listener") in slot 0 of the registered file table. Now add an operation on the socket that uses slot 0. Finally, loop N times, where each loop creates a new socket, registers said socket as a file, then unregisters the socket, and finally closes the socket. This is roughly similar to what a basic accept loop would look like. At the end of this loop, it's not unreasonable to expect that there would still be 5 open files. Each socket created and registered in the loop is also unregistered and closed. But since the listener socket registered first still has references to its resource node due to still being active, each subsequent socket unregistration is stuck behind it for reclaim. Hence 5 + N files are still open at that point, where N is awaiting the final put held up by the listener socket. Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct io_kiocb now gets explicit resource nodes assigned, with each holding a reference to the parent node. A parent node is either of type FILE or BUFFER, which are the two types of nodes that exist. A request can have two nodes assigned, if it's using both registered files and buffers. Since request issue and task_work completion is both under the ring private lock, no atomics are needed to handle these references. It's a simple unlocked inc/dec. As before, the registered buffer or file table each hold a reference as well to the registered nodes. Final put of the node will remove the node and free the underlying resource, eg unmap the buffer or put the file. Outside of removing the stall in resource reclaim described above, it has the following advantages: 1) It's a lot simpler than the previous scheme, and easier to follow. No need to specific quiesce handling anymore. 2) There are no resource node allocations in the fast path, all of that happens at resource registration time. 3) The structs related to resource handling can all get simplified quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can go away completely. 4) Handling of resource tags is much simpler, and doesn't require persistent storage as it can simply get assigned up front at registration time. Just copy them in one-by-one at registration time and assign to the resource node. The only real downside is that a request is now explicitly limited to pinning 2 resources, one file and one buffer, where before just assigning a resource node to a request would pin all of them. The upside is that it's easier to follow now, as an individual resource is explicitly referenced and assigned to the request. With this in place, the above mentioned example will be using exactly 5 files at the end of the loop, not N. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index d4ba4ae480d6..42c5f2c992c4 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -56,7 +56,7 @@ struct io_wq_work { }; struct io_file_table { - struct io_fixed_file *files; + struct io_rsrc_node **nodes; unsigned long *bitmap; unsigned int alloc_hint; }; @@ -264,7 +264,6 @@ struct io_ring_ctx { * Fixed resources fast path, should be accessed only under * uring_lock, and updated through io_uring_register(2) */ - struct io_rsrc_node *rsrc_node; atomic_t cancel_seq; /* @@ -277,7 +276,7 @@ struct io_ring_ctx { struct io_wq_work_list iopoll_list; struct io_file_table file_table; - struct io_mapped_ubuf **user_bufs; + struct io_rsrc_node **user_bufs; unsigned nr_user_files; unsigned nr_user_bufs; @@ -372,10 +371,7 @@ struct io_ring_ctx { struct io_rsrc_data *buf_data; /* protected by ->uring_lock */ - struct list_head rsrc_ref_list; struct io_alloc_cache rsrc_node_cache; - struct wait_queue_head rsrc_quiesce_wq; - unsigned rsrc_quiesce; u32 pers_next; struct xarray personalities; @@ -642,7 +638,7 @@ struct io_kiocb { __poll_t apoll_events; }; - struct io_rsrc_node *rsrc_node; + struct io_rsrc_node *rsrc_nodes[2]; atomic_t refs; bool cancel_seq_set; -- cgit v1.2.3 From fbbb8e991d86bb7539de6161746b6c747f93f533 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 26 Oct 2024 06:43:44 -0600 Subject: io_uring/rsrc: get rid of io_rsrc_node allocation cache It's not going to be needed in the fast path going forward, so kill it off. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 42c5f2c992c4..696f2a05a98b 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -370,9 +370,6 @@ struct io_ring_ctx { struct io_rsrc_data *file_data; struct io_rsrc_data *buf_data; - /* protected by ->uring_lock */ - struct io_alloc_cache rsrc_node_cache; - u32 pers_next; struct xarray personalities; -- cgit v1.2.3 From 3597f2786b687a7f26361ce00a805ea0af41b65f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 26 Oct 2024 14:50:13 -0600 Subject: io_uring/rsrc: unify file and buffer resource tables For files, there's nr_user_files/file_table/file_data, and buffers have nr_user_bufs/user_bufs/buf_data. There's no reason why file_table and file_data can't be the same thing, and ditto for the buffer side. That gets rid of more io_ring_ctx state that's in two spots rather than just being in one spot, as it should be. Put all the registered file data in one locations, and ditto on the buffer front. This also avoids having both io_rsrc_data->nodes being an allocated array, and ->user_bufs[] or ->file_table.nodes. There's no reason to have this information duplicated. Keep it in one spot, io_rsrc_data, along with how many resources are available. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 696f2a05a98b..77fd508d043a 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -55,8 +55,13 @@ struct io_wq_work { int cancel_seq; }; +struct io_rsrc_data { + unsigned int nr; + struct io_rsrc_node **nodes; +}; + struct io_file_table { - struct io_rsrc_node **nodes; + struct io_rsrc_data data; unsigned long *bitmap; unsigned int alloc_hint; }; @@ -276,9 +281,7 @@ struct io_ring_ctx { struct io_wq_work_list iopoll_list; struct io_file_table file_table; - struct io_rsrc_node **user_bufs; - unsigned nr_user_files; - unsigned nr_user_bufs; + struct io_rsrc_data buf_table; struct io_submit_state submit_state; @@ -366,10 +369,6 @@ struct io_ring_ctx { struct wait_queue_head poll_wq; struct io_restriction restrictions; - /* slow path rsrc auxilary data, used by update/register */ - struct io_rsrc_data *file_data; - struct io_rsrc_data *buf_data; - u32 pers_next; struct xarray personalities; -- cgit v1.2.3 From 01ee194d1aba1202f0926d5047a2a4cf84d0e45d Mon Sep 17 00:00:00 2001 From: hexue Date: Fri, 1 Nov 2024 17:19:57 +0800 Subject: io_uring: add support for hybrid IOPOLL A new hybrid poll is implemented on the io_uring layer. Once an IO is issued, it will not poll immediately, but rather block first and re-run before IO complete, then poll to reap IO. While this poll method could be a suboptimal solution when running on a single thread, it offers performance lower than regular polling but higher than IRQ, and CPU utilization is also lower than polling. To use hybrid polling, the ring must be setup with both the IORING_SETUP_IOPOLL and IORING_SETUP_HYBRID)IOPOLL flags set. Hybrid polling has the same restrictions as IOPOLL, in that commands must explicitly support it. Signed-off-by: hexue Link: https://lore.kernel.org/r/20241101091957.564220-2-xue01.he@samsung.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 77fd508d043a..d52fec533c51 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -298,6 +298,11 @@ struct io_ring_ctx { * ->uring_cmd() by io_uring_cmd_insert_cancelable() */ struct hlist_head cancelable_uring_cmd; + /* + * For Hybrid IOPOLL, runtime in hybrid polling, without + * scheduling time + */ + u64 hybrid_poll_time; } ____cacheline_aligned_in_smp; struct { @@ -449,6 +454,7 @@ enum { REQ_F_LINK_TIMEOUT_BIT, REQ_F_NEED_CLEANUP_BIT, REQ_F_POLLED_BIT, + REQ_F_HYBRID_IOPOLL_STATE_BIT, REQ_F_BUFFER_SELECTED_BIT, REQ_F_BUFFER_RING_BIT, REQ_F_REISSUE_BIT, @@ -507,6 +513,8 @@ enum { REQ_F_NEED_CLEANUP = IO_REQ_FLAG(REQ_F_NEED_CLEANUP_BIT), /* already went through poll handler */ REQ_F_POLLED = IO_REQ_FLAG(REQ_F_POLLED_BIT), + /* every req only blocks once in hybrid poll */ + REQ_F_IOPOLL_STATE = IO_REQ_FLAG(REQ_F_HYBRID_IOPOLL_STATE_BIT), /* buffer already selected */ REQ_F_BUFFER_SELECTED = IO_REQ_FLAG(REQ_F_BUFFER_SELECTED_BIT), /* buffer selected from ring, needs commit */ @@ -639,8 +647,15 @@ struct io_kiocb { atomic_t refs; bool cancel_seq_set; struct io_task_work io_task_work; - /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ - struct hlist_node hash_node; + union { + /* + * for polled requests, i.e. IORING_OP_POLL_ADD and async armed + * poll + */ + struct hlist_node hash_node; + /* For IOPOLL setup queues, with hybrid polling */ + u64 iopoll_start; + }; /* internal polling, see IORING_FEAT_FAST_POLL */ struct async_poll *apoll; /* opcode allocated if it needs to store data for async defer */ -- cgit v1.2.3 From 53c0a58beb60b76e105a61aae518fd780eec03d9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 25 May 2024 23:32:20 -0400 Subject: net/socket.c: switch to CLASS(fd) The important part in sockfd_lookup_light() is avoiding needless file refcount operations, not the marginal reduction of the register pressure from not keeping a struct file pointer in the caller. Switch to use fdget()/fdpu(); with sane use of CLASS(fd) we can get a better code generation... Would be nice if somebody tested it on networking test suites (including benchmarks)... sockfd_lookup_light() does fdget(), uses sock_from_file() to get the associated socket and returns the struct socket reference to the caller, along with "do we need to fput()" flag. No matching fdput(), the caller does its equivalent manually, using the fact that sock->file points to the struct file the socket has come from. Get rid of that - have the callers do fdget()/fdput() and use sock_from_file() directly. That kills sockfd_lookup_light() and fput_light() (no users left). What's more, we can get rid of explicit fdget()/fdput() by switching to CLASS(fd, ...) - code generation does not suffer, since now fdput() inserted on "descriptor is not opened" failure exit is recognized to be a no-op by compiler. [folded a fix for braino in do_recvmmsg() caught by Simon Horman] Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- include/linux/file.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/file.h b/include/linux/file.h index f98de143245a..b49a92295b3f 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -30,12 +30,6 @@ extern struct file *alloc_file_pseudo_noaccount(struct inode *, struct vfsmount extern struct file *alloc_file_clone(struct file *, int flags, const struct file_operations *); -static inline void fput_light(struct file *file, int fput_needed) -{ - if (fput_needed) - fput(file); -} - /* either a reference to struct file + flags * (cloned vs. borrowed, pos locked), with * flags stored in lower bits of value, -- cgit v1.2.3 From f302edb9d822804e72df3fa6ba270234050c678b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 Jul 2024 21:49:04 -0400 Subject: switch netlink_getsockbyfilp() to taking descriptor the only call site (in do_mq_notify()) obtains the argument from an immediately preceding fdget() and it is immediately followed by fdput(); might as well just replace it with a variant that would take a descriptor instead of struct file * and have file lookups handled inside that function. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- include/linux/netlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index b332c2048c75..a48a30842d84 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -239,7 +239,7 @@ int netlink_register_notifier(struct notifier_block *nb); int netlink_unregister_notifier(struct notifier_block *nb); /* finegrained unicast helpers: */ -struct sock *netlink_getsockbyfilp(struct file *filp); +struct sock *netlink_getsockbyfd(int fd); int netlink_attachskb(struct sock *sk, struct sk_buff *skb, long *timeo, struct sock *ssk); void netlink_detachskb(struct sock *sk, struct sk_buff *skb); -- cgit v1.2.3 From d7a9616ce0348b9d945d5dff82e4b44c0fe75b39 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 31 May 2024 22:10:12 -0400 Subject: introduce "fd_pos" class, convert fdget_pos() users to it. fdget_pos() for constructor, fdput_pos() for cleanup, all users of fd..._pos() converted trivially. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- include/linux/file.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/file.h b/include/linux/file.h index b49a92295b3f..4b09a8de2fd5 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -81,6 +81,7 @@ static inline void fdput_pos(struct fd f) DEFINE_CLASS(fd, struct fd, fdput(_T), fdget(fd), int fd) DEFINE_CLASS(fd_raw, struct fd, fdput(_T), fdget_raw(fd), int fd) +DEFINE_CLASS(fd_pos, struct fd, fdput_pos(_T), fdget_pos(fd), int fd) extern int f_dupfd(unsigned int from, struct file *file, unsigned flags); extern int replace_fd(unsigned fd, struct file *file, unsigned flags); -- cgit v1.2.3 From 38052c2dd71f5490f34bba21dc358e97fb205ee5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 6 Jun 2024 19:29:04 -0400 Subject: deal with the last remaing boolean uses of fd_file() Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- include/linux/cleanup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 038b2d523bf8..875c998275c0 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -234,7 +234,7 @@ const volatile void * __must_check_fn(const volatile void *val) * DEFINE_CLASS(fdget, struct fd, fdput(_T), fdget(fd), int fd) * * CLASS(fdget, f)(fd); - * if (!fd_file(f)) + * if (fd_empty(f)) * return -EBADF; * * // use 'f' without concern -- cgit v1.2.3 From a1afb959add1fad43cb337448c244ed70bac3109 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 30 Oct 2024 09:11:56 +0100 Subject: dpll: add clock quality level attribute and op In order to allow driver expose quality level of the clock it is running, introduce a new netlink attr with enum to carry it to the userspace. Also, introduce an op the dpll netlink code calls into the driver to obtain the value. Signed-off-by: Jiri Pirko Link: https://patch.msgid.link/20241030081157.966604-2-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- include/linux/dpll.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dpll.h b/include/linux/dpll.h index 81f7b623d0ba..5e4f9ab1cf75 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -26,6 +26,10 @@ struct dpll_device_ops { struct netlink_ext_ack *extack); int (*temp_get)(const struct dpll_device *dpll, void *dpll_priv, s32 *temp, struct netlink_ext_ack *extack); + int (*clock_quality_level_get)(const struct dpll_device *dpll, + void *dpll_priv, + unsigned long *qls, + struct netlink_ext_ack *extack); }; struct dpll_pin_ops { -- cgit v1.2.3 From a8f80673ca0daaad990882529a5b4dc5114071e7 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Thu, 24 Oct 2024 17:37:41 +0800 Subject: compiler_types: Add noinline_for_tracing annotation Kernel functions that are not inlined can be easily hooked with BPF for tracing. However, functions intended for tracing may still be inlined unexpectedly. For example, in our case, after upgrading the compiler from GCC 9 to GCC 11, the tcp_drop_reason() function was inlined, which broke our monitoring tools. To prevent this, we need to ensure that the function remains non-inlined. The noinline_for_tracing annotation is introduced as a general solution for preventing inlining of kernel functions that need to be traced. This approach avoids the need for adding individual noinline comments to each function and provides a more consistent way to maintain traceability. Link: https://lore.kernel.org/netdev/CANn89iKvr44ipuRYFaPTpzwz=B_+pgA94jsggQ946mjwreV6Aw@mail.gmail.com/ Suggested-by: Eric Dumazet Signed-off-by: Yafang Shao Link: https://patch.msgid.link/20241024093742.87681-2-laoar.shao@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/compiler_types.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 1a957ea2f4fe..0c8b9601e603 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -265,6 +265,12 @@ struct ftrace_likely_data { */ #define noinline_for_stack noinline +/* + * Use noinline_for_tracing for functions that should not be inlined. + * For tracing reasons. + */ +#define noinline_for_tracing noinline + /* * Sanitizer helper attributes: Because using __always_inline and * __no_sanitize_* conflict, provide helper attributes that will either expand -- cgit v1.2.3 From 3bd9b9abdf1563a22041b7255baea6d449902f1a Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 29 Oct 2024 15:58:47 -0600 Subject: net: ethtool: Avoid thousands of -Wflex-array-member-not-at-end warnings -Wflex-array-member-not-at-end was introduced in GCC-14, and we are getting ready to enable it, globally. Change the type of the middle struct member currently causing trouble from `struct ethtool_link_settings` to `struct ethtool_link_settings_hdr`. Additionally, update the type of some variables in various functions that don't access the flexible-array member, changing them to the newly created `struct ethtool_link_settings_hdr`. These changes are needed because the type of the conflicting middle members changed. So, those instances that expect the type to be `struct ethtool_link_settings` should be adjusted to the newly created type `struct ethtool_link_settings_hdr`. Also, adjust variable declarations to follow the reverse xmas tree convention. Fix 3338 of the following -Wflex-array-member-not-at-end warnings: include/linux/ethtool.h:214:38: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] Signed-off-by: Gustavo A. R. Silva Link: https://patch.msgid.link/0bc2809fe2a6c11dd4c8a9a10d9bd65cccdb559b.1730238285.git.gustavoars@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 12f6dc567598..1199e308c8dd 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -211,7 +211,7 @@ void ethtool_rxfh_context_lost(struct net_device *dev, u32 context_id); * fields, but they are allowed to overwrite them (will be ignored). */ struct ethtool_link_ksettings { - struct ethtool_link_settings base; + struct ethtool_link_settings_hdr base; struct { __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising); -- cgit v1.2.3 From 2ea25aab938a250bdf3148acd15359b56b91b40e Mon Sep 17 00:00:00 2001 From: David Lechner Date: Tue, 29 Oct 2024 16:18:49 -0500 Subject: pwm: core: export pwm_get_state_hw() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Export the pwm_get_state_hw() function. This is useful in cases where we want to know what the hardware is actually doing, rather than what what we requested it should do. Locking had to be rearranged to ensure that the chip is still operational before trying to access ops now that this can be called from outside the pwm core. Signed-off-by: David Lechner Link: https://lore.kernel.org/r/20241029-pwm-export-pwm_get_state_hw-v2-1-03ba063a3230@baylibre.com [ukleinek: Add dummy for !CONFIG_PWM] Signed-off-by: Uwe Kleine-König --- include/linux/pwm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index f1cb1e5b0a36..78827f312407 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -370,6 +370,7 @@ int pwm_get_waveform_might_sleep(struct pwm_device *pwm, struct pwm_waveform *wf int pwm_set_waveform_might_sleep(struct pwm_device *pwm, const struct pwm_waveform *wf, bool exact); int pwm_apply_might_sleep(struct pwm_device *pwm, const struct pwm_state *state); int pwm_apply_atomic(struct pwm_device *pwm, const struct pwm_state *state); +int pwm_get_state_hw(struct pwm_device *pwm, struct pwm_state *state); int pwm_adjust_config(struct pwm_device *pwm); /** @@ -494,6 +495,11 @@ static inline int pwm_apply_atomic(struct pwm_device *pwm, return -EOPNOTSUPP; } +static inline int pwm_get_state_hw(struct pwm_device *pwm, struct pwm_state *state) +{ + return -EOPNOTSUPP; +} + static inline int pwm_adjust_config(struct pwm_device *pwm) { return -EOPNOTSUPP; -- cgit v1.2.3 From 9a5a2483bc60c12d73ac6ca5ac5ab95361a895f4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 1 Nov 2024 12:53:42 +0200 Subject: iio: Mark iio_dev::priv member with __private The member is not supposed to be accessed directly, mark it with __private to catch the misuses up. Signed-off-by: Andy Shevchenko Link: https://patch.msgid.link/20241101105342.3645018-1-andriy.shevchenko@linux.intel.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 445d6666a291..5c6682bd4cb9 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -624,7 +624,7 @@ struct iio_dev { const struct iio_info *info; const struct iio_buffer_setup_ops *setup_ops; - void *priv; + void *priv __private; }; int iio_device_id(struct iio_dev *indio_dev); @@ -785,7 +785,7 @@ struct iio_dev *iio_device_alloc(struct device *parent, int sizeof_priv); /* The information at the returned address is guaranteed to be cacheline aligned */ static inline void *iio_priv(const struct iio_dev *indio_dev) { - return indio_dev->priv; + return ACCESS_PRIVATE(indio_dev, priv); } void iio_device_free(struct iio_dev *indio_dev); -- cgit v1.2.3 From 6e6738398def256126185cd25e2e3cb68f1bc0a3 Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Fri, 1 Nov 2024 07:46:27 +0000 Subject: iio: hid-sensors: Add proximity and attention IDs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The HID Usage Table at https://usb.org/sites/default/files/hut1_5.pdf reserves: - 0x4b2 for Human Proximity Range Distance between a human and the computer. Default unit of measure is meters; https://www.usb.org/sites/default/files/hutrr39b_0.pdf - 0x4bd for Human Attention Detected Human-Presence sensors detect the presence of humans in the sensor’s field-of-view using diverse and evolving technologies. Some presence sensors are implemented with low resolution video cameras, which can additionally track a subject’s attention (i.e. if the user is ‘looking’ at the system with the integrated sensor). A Human-Presence sensor, providing a Host with the user’s attention state, allows the Host to optimize its behavior. For example, to brighten/dim the system display, based on the user’s attention to the system (potentially prolonging battery life). Default unit is true/false; https://www.usb.org/sites/default/files/hutrr107-humanpresenceattention_1.pdf Signed-off-by: Ricardo Ribalda Link: https://patch.msgid.link/20241101-hpd-v3-1-e9c80b7c7164@chromium.org Signed-off-by: Jonathan Cameron --- include/linux/hid-sensor-ids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid-sensor-ids.h b/include/linux/hid-sensor-ids.h index 6730ee900ee1..8a03d9696b1c 100644 --- a/include/linux/hid-sensor-ids.h +++ b/include/linux/hid-sensor-ids.h @@ -30,6 +30,8 @@ #define HID_USAGE_SENSOR_PROX 0x200011 #define HID_USAGE_SENSOR_DATA_PRESENCE 0x2004b0 #define HID_USAGE_SENSOR_HUMAN_PRESENCE 0x2004b1 +#define HID_USAGE_SENSOR_HUMAN_PROXIMITY 0x2004b2 +#define HID_USAGE_SENSOR_HUMAN_ATTENTION 0x2004bd /* Pressure (200031) */ #define HID_USAGE_SENSOR_PRESSURE 0x200031 -- cgit v1.2.3 From b4b42f28a0df6b9d31f0003f7dea3bddf272eaa4 Mon Sep 17 00:00:00 2001 From: Julien Stephan Date: Thu, 31 Oct 2024 16:27:02 +0100 Subject: iio: fix write_event_config signature write_event_config callback use an int for state, but it is actually a boolean. iio_ev_state_store is actually using kstrtobool to check user input, then gives the converted boolean value to write_event_config. Fix signature and update all iio drivers to use the new signature. This patch has been partially written using coccinelle with the following script: $ cat iio-bool.cocci // Options: --all-includes virtual patch @c1@ identifier iioinfo; identifier wecfunc; @@ static const struct iio_info iioinfo = { ..., .write_event_config = ( wecfunc | &wecfunc ), ..., }; @@ identifier c1.wecfunc; identifier indio_dev, chan, type, dir, state; @@ int wecfunc(struct iio_dev *indio_dev, const struct iio_chan_spec *chan, enum iio_event_type type, enum iio_event_direction dir, -int +bool state) { ... } make coccicheck MODE=patch COCCI=iio-bool.cocci M=drivers/iio Unfortunately, this script didn't match all files: * all write_event_config callbacks using iio_device_claim_direct_scoped were not detected and not patched. * all files that do not assign and declare the write_event_config callback in the same file. iio.h was also manually updated. The patch was build tested using allmodconfig config. cc: Julia Lawall Signed-off-by: Julien Stephan Link: https://patch.msgid.link/20241031-iio-fix-write-event-config-signature-v2-7-2bcacbb517a2@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 5c6682bd4cb9..59c58f455311 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -514,7 +514,7 @@ struct iio_info { const struct iio_chan_spec *chan, enum iio_event_type type, enum iio_event_direction dir, - int state); + bool state); int (*read_event_value)(struct iio_dev *indio_dev, const struct iio_chan_spec *chan, -- cgit v1.2.3 From 5d8173b8493151d32b99ec6732fb29c58256a7c8 Mon Sep 17 00:00:00 2001 From: Julien Stephan Date: Mon, 28 Oct 2024 17:38:11 +0100 Subject: iio: events.h: add event identifier macros for differential channel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, there are 3 helper macros in iio/events.h to create event identifiers: - IIO_EVENT_CODE : create generic event identifier for differential and non differential channels - IIO_MOD_EVENT_CODE : create event identifier for modified (non differential) channels - IIO_UNMOD_EVENT_CODE : create event identifier for unmodified (non differential) channels For differential channels, drivers are expected to use IIO_EVENT_CODE. However, only one driver in drivers/iio currently uses it correctly, leading to inconsistent event identifiers for differential channels that don’t match the intended attributes (such as max1363.c that supports differential channels, but only uses IIO_UNMOD_EVENT_CODE). To prevent such issues in future drivers, a new helper macro, IIO_DIFF_EVENT_CODE, is introduced to specifically create event identifiers for differential channels. Only one helper is needed for differential channels since they cannot have modifiers. Additionally, the descriptions for IIO_MOD_EVENT_CODE and IIO_UNMOD_EVENT_CODE have been updated to clarify that they are intended for non-differential channels, Signed-off-by: Julien Stephan Reviewed-by: David Lechner Link: https://patch.msgid.link/20241028-iio-add-macro-for-even-identifier-for-differential-channels-v1-1-b452c90f7ea6@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/events.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/events.h b/include/linux/iio/events.h index a4558c45a548..eeaba5e1525e 100644 --- a/include/linux/iio/events.h +++ b/include/linux/iio/events.h @@ -30,7 +30,8 @@ /** - * IIO_MOD_EVENT_CODE() - create event identifier for modified channels + * IIO_MOD_EVENT_CODE() - create event identifier for modified (non + * differential) channels * @chan_type: Type of the channel. Should be one of enum iio_chan_type. * @number: Channel number. * @modifier: Modifier for the channel. Should be one of enum iio_modifier. @@ -43,7 +44,8 @@ IIO_EVENT_CODE(chan_type, 0, modifier, direction, type, number, 0, 0) /** - * IIO_UNMOD_EVENT_CODE() - create event identifier for unmodified channels + * IIO_UNMOD_EVENT_CODE() - create event identifier for unmodified (non + * differential) channels * @chan_type: Type of the channel. Should be one of enum iio_chan_type. * @number: Channel number. * @type: Type of the event. Should be one of enum iio_event_type. @@ -53,4 +55,16 @@ #define IIO_UNMOD_EVENT_CODE(chan_type, number, type, direction) \ IIO_EVENT_CODE(chan_type, 0, 0, direction, type, number, 0, 0) +/** + * IIO_DIFF_EVENT_CODE() - create event identifier for differential channels + * @chan_type: Type of the channel. Should be one of enum iio_chan_type. + * @chan1: First channel number for differential channels. + * @chan2: Second channel number for differential channels. + * @type: Type of the event. Should be one of enum iio_event_type. + * @direction: Direction of the event. One of enum iio_event_direction. + */ + +#define IIO_DIFF_EVENT_CODE(chan_type, chan1, chan2, type, direction) \ + IIO_EVENT_CODE(chan_type, 1, 0, direction, type, 0, chan1, chan2) + #endif -- cgit v1.2.3 From 01f567d22152dfa8799e9fde5f18bbb5650d8681 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 1 Nov 2024 17:17:10 -0500 Subject: iio: events: make IIO_EVENT_CODE macro private Make IIO_EVENT_CODE "private" by adding a leading underscore. There are no more users of this macro in the kernel so we can make it "private" and encourage developers to use the specialized versions of the macro instead. Signed-off-by: David Lechner Link: https://patch.msgid.link/20241101-iio-fix-event-macro-use-v1-3-0000c5d09f6d@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/events.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/events.h b/include/linux/iio/events.h index eeaba5e1525e..72062a0c7c87 100644 --- a/include/linux/iio/events.h +++ b/include/linux/iio/events.h @@ -10,7 +10,7 @@ #include /** - * IIO_EVENT_CODE() - create event identifier + * _IIO_EVENT_CODE() - create event identifier * @chan_type: Type of the channel. Should be one of enum iio_chan_type. * @diff: Whether the event is for an differential channel or not. * @modifier: Modifier for the channel. Should be one of enum iio_modifier. @@ -19,10 +19,13 @@ * @chan: Channel number for non-differential channels. * @chan1: First channel number for differential channels. * @chan2: Second channel number for differential channels. + * + * Drivers should use the specialized macros below instead of using this one + * directly. */ -#define IIO_EVENT_CODE(chan_type, diff, modifier, direction, \ - type, chan, chan1, chan2) \ +#define _IIO_EVENT_CODE(chan_type, diff, modifier, direction, \ + type, chan, chan1, chan2) \ (((u64)type << 56) | ((u64)diff << 55) | \ ((u64)direction << 48) | ((u64)modifier << 40) | \ ((u64)chan_type << 32) | (((u16)chan2) << 16) | ((u16)chan1) | \ @@ -41,7 +44,7 @@ #define IIO_MOD_EVENT_CODE(chan_type, number, modifier, \ type, direction) \ - IIO_EVENT_CODE(chan_type, 0, modifier, direction, type, number, 0, 0) + _IIO_EVENT_CODE(chan_type, 0, modifier, direction, type, number, 0, 0) /** * IIO_UNMOD_EVENT_CODE() - create event identifier for unmodified (non @@ -53,7 +56,7 @@ */ #define IIO_UNMOD_EVENT_CODE(chan_type, number, type, direction) \ - IIO_EVENT_CODE(chan_type, 0, 0, direction, type, number, 0, 0) + _IIO_EVENT_CODE(chan_type, 0, 0, direction, type, number, 0, 0) /** * IIO_DIFF_EVENT_CODE() - create event identifier for differential channels @@ -65,6 +68,6 @@ */ #define IIO_DIFF_EVENT_CODE(chan_type, chan1, chan2, type, direction) \ - IIO_EVENT_CODE(chan_type, 1, 0, direction, type, 0, chan1, chan2) + _IIO_EVENT_CODE(chan_type, 1, 0, direction, type, 0, chan1, chan2) #endif -- cgit v1.2.3 From a865276872ec4f129f8a582634be82dcc275dc2a Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 30 Oct 2024 18:23:25 -0600 Subject: dim: make dim_calc_stats() inputs const pointers Make the start and end arguments to dim_calc_stats() const pointers to clarify that the function does not modify their values. Signed-off-by: Caleb Sander Mateos Reviewed-by: Vladimir Oltean Reviewed-by: Florian Fainelli Reviewed-by: Arthur Kiyanovski Link: https://patch.msgid.link/20241031002326.3426181-1-csander@purestorage.com Signed-off-by: Jakub Kicinski --- include/linux/dim.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dim.h b/include/linux/dim.h index 1b581ff25a15..84579a50ae7f 100644 --- a/include/linux/dim.h +++ b/include/linux/dim.h @@ -351,7 +351,8 @@ void dim_park_tired(struct dim *dim); * Takes into consideration counter wrap-around. * Returned boolean indicates whether curr_stats are reliable. */ -bool dim_calc_stats(struct dim_sample *start, struct dim_sample *end, +bool dim_calc_stats(const struct dim_sample *start, + const struct dim_sample *end, struct dim_stats *curr_stats); /** -- cgit v1.2.3 From 61bf0009a7657d394d942c8ee961b9ea5f2168fe Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 30 Oct 2024 18:23:26 -0600 Subject: dim: pass dim_sample to net_dim() by reference net_dim() is currently passed a struct dim_sample argument by value. struct dim_sample is 24 bytes. Since this is greater 16 bytes, x86-64 passes it on the stack. All callers have already initialized dim_sample on the stack, so passing it by value requires pushing a duplicated copy to the stack. Either witing to the stack and immediately reading it, or perhaps dereferencing addresses relative to the stack pointer in a chain of push instructions, seems to perform quite poorly. In a heavy TCP workload, mlx5e_handle_rx_dim() consumes 3% of CPU time, 94% of which is attributed to the first push instruction to copy dim_sample on the stack for the call to net_dim(): // Call ktime_get() 0.26 |4ead2: call 4ead7 // Pass the address of struct dim in %rdi |4ead7: lea 0x3d0(%rbx),%rdi // Set dim_sample.pkt_ctr |4eade: mov %r13d,0x8(%rsp) // Set dim_sample.byte_ctr |4eae3: mov %r12d,0xc(%rsp) // Set dim_sample.event_ctr 0.15 |4eae8: mov %bp,0x10(%rsp) // Duplicate dim_sample on the stack 94.16 |4eaed: push 0x10(%rsp) 2.79 |4eaf1: push 0x10(%rsp) 0.07 |4eaf5: push %rax // Call net_dim() 0.21 |4eaf6: call 4eafb To allow the caller to reuse the struct dim_sample already on the stack, pass the struct dim_sample by reference to net_dim(). Signed-off-by: Caleb Sander Mateos Reviewed-by: Vladimir Oltean Reviewed-by: Shannon Nelson Reviewed-by: Florian Fainelli Reviewed-by: Arthur Kiyanovski Reviewed-by: Louis Peens Link: https://patch.msgid.link/20241031002326.3426181-2-csander@purestorage.com Signed-off-by: Jakub Kicinski --- include/linux/dim.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dim.h b/include/linux/dim.h index 84579a50ae7f..06543fd40fcc 100644 --- a/include/linux/dim.h +++ b/include/linux/dim.h @@ -425,7 +425,7 @@ struct dim_cq_moder net_dim_get_def_tx_moderation(u8 cq_period_mode); * This is the main logic of the algorithm, where data is processed in order * to decide on next required action. */ -void net_dim(struct dim *dim, struct dim_sample end_sample); +void net_dim(struct dim *dim, const struct dim_sample *end_sample); /* RDMA DIM */ -- cgit v1.2.3 From 1685f685ff8036c74c3b5e9006ec7ceda5bafefa Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sat, 26 Oct 2024 18:43:33 +0300 Subject: soc: qcom: llcc: add support for SAR2130P and SAR1130P Implement necessary support for the LLCC control on the SAR1130P and SAR2130P platforms. These two platforms use different ATTR1_MAX_CAP shift and also require manual override for num_banks. Signed-off-by: Dmitry Baryshkov Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20241026-sar2130p-llcc-v3-3-2a58fa1b4d12@linaro.org Signed-off-by: Bjorn Andersson --- include/linux/soc/qcom/llcc-qcom.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h index 9e9f528b1370..a79bfac230c6 100644 --- a/include/linux/soc/qcom/llcc-qcom.h +++ b/include/linux/soc/qcom/llcc-qcom.h @@ -54,7 +54,19 @@ #define LLCC_CAMEXP4 52 #define LLCC_DISP_WB 53 #define LLCC_DISP_1 54 +#define LLCC_VIEYE 57 +#define LLCC_VIDPTH 58 +#define LLCC_GPUMV 59 +#define LLCC_EVA_LEFT 60 +#define LLCC_EVA_RIGHT 61 +#define LLCC_EVAGAIN 62 +#define LLCC_VIPTH 63 #define LLCC_VIDVSP 64 +#define LLCC_DISP_LEFT 65 +#define LLCC_DISP_RIGHT 66 +#define LLCC_EVCS_LEFT 67 +#define LLCC_EVCS_RIGHT 68 +#define LLCC_SPAD 69 /** * struct llcc_slice_desc - Cache slice descriptor -- cgit v1.2.3 From 8ab3138a9b2dcb0ddf281240cf8cba414eb1224a Mon Sep 17 00:00:00 2001 From: Edward Srouji Date: Tue, 3 Sep 2024 14:37:51 +0300 Subject: net/mlx5: Introduce data placement ordering bits Introduce out-of-order (OOO) data placement (DP) IFC related bits to support OOO DP QP. Signed-off-by: Edward Srouji Reviewed-by: Yishai Hadas Link: https://patch.msgid.link/f30e5cbb5459fd02f27f35909bb545cab346b58b.1725362773.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 96d369112bfa..2a037843b117 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1872,7 +1872,11 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_328[0x2]; u8 relaxed_ordering_read[0x1]; u8 log_max_pd[0x5]; - u8 reserved_at_330[0x5]; + u8 dp_ordering_ooo_all_ud[0x1]; + u8 dp_ordering_ooo_all_uc[0x1]; + u8 dp_ordering_ooo_all_xrc[0x1]; + u8 dp_ordering_ooo_all_dc[0x1]; + u8 dp_ordering_ooo_all_rc[0x1]; u8 pcie_reset_using_hotreset_method[0x1]; u8 pci_sync_for_fw_update_with_driver_unload[0x1]; u8 vnic_env_cnt_steering_fail[0x1]; @@ -2094,7 +2098,9 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 reserved_at_0[0x80]; u8 migratable[0x1]; - u8 reserved_at_81[0x11]; + u8 reserved_at_81[0x7]; + u8 dp_ordering_force[0x1]; + u8 reserved_at_89[0x9]; u8 query_vuid[0x1]; u8 reserved_at_93[0x5]; u8 umr_log_entity_size_5[0x1]; @@ -3524,7 +3530,8 @@ struct mlx5_ifc_qpc_bits { u8 latency_sensitive[0x1]; u8 reserved_at_24[0x1]; u8 drain_sigerr[0x1]; - u8 reserved_at_26[0x2]; + u8 reserved_at_26[0x1]; + u8 dp_ordering_force[0x1]; u8 pd[0x18]; u8 mtu[0x3]; @@ -3597,7 +3604,8 @@ struct mlx5_ifc_qpc_bits { u8 rae[0x1]; u8 reserved_at_493[0x1]; u8 page_offset[0x6]; - u8 reserved_at_49a[0x3]; + u8 reserved_at_49a[0x2]; + u8 dp_ordering_1[0x1]; u8 cd_slave_receive[0x1]; u8 cd_slave_send[0x1]; u8 cd_master[0x1]; @@ -4507,7 +4515,8 @@ struct mlx5_ifc_dctc_bits { u8 state[0x4]; u8 reserved_at_8[0x18]; - u8 reserved_at_20[0x8]; + u8 reserved_at_20[0x7]; + u8 dp_ordering_force[0x1]; u8 user_index[0x18]; u8 reserved_at_40[0x8]; @@ -4522,7 +4531,9 @@ struct mlx5_ifc_dctc_bits { u8 latency_sensitive[0x1]; u8 rlky[0x1]; u8 free_ar[0x1]; - u8 reserved_at_73[0xd]; + u8 reserved_at_73[0x1]; + u8 dp_ordering_1[0x1]; + u8 reserved_at_75[0xb]; u8 reserved_at_80[0x8]; u8 cs_res[0x8]; -- cgit v1.2.3 From fe5ba6bf91b3e30118c59fe51048cda101de6542 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Wed, 30 Oct 2024 17:39:15 +0800 Subject: net: enetc: add initial netc-blk-ctrl driver support The netc-blk-ctrl driver is used to configure Integrated Endpoint Register Block (IERB) and Privileged Register Block (PRB) of NETC. For i.MX platforms, it is also used to configure the NETCMIX block. The IERB contains registers that are used for pre-boot initialization, debug, and non-customer configuration. The PRB controls global reset and global error handling for NETC. The NETCMIX block is mainly used to set MII protocol and PCS protocol of the links, it also contains settings for some other functions. Note the IERB configuration registers can only be written after being unlocked by PRB, otherwise, all write operations are inhibited. A warm reset is performed when the IERB is unlocked, and it results in an FLR to all NETC devices. Therefore, all NETC device drivers must be probed or initialized after the warm reset is finished. Signed-off-by: Wei Fang Reviewed-by: Frank Li Signed-off-by: David S. Miller --- include/linux/fsl/netc_global.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 include/linux/fsl/netc_global.h (limited to 'include/linux') diff --git a/include/linux/fsl/netc_global.h b/include/linux/fsl/netc_global.h new file mode 100644 index 000000000000..fdecca8c90f0 --- /dev/null +++ b/include/linux/fsl/netc_global.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +/* Copyright 2024 NXP + */ +#ifndef __NETC_GLOBAL_H +#define __NETC_GLOBAL_H + +#include + +static inline u32 netc_read(void __iomem *reg) +{ + return ioread32(reg); +} + +static inline void netc_write(void __iomem *reg, u32 val) +{ + iowrite32(val, reg); +} + +#endif -- cgit v1.2.3 From 2a8f6153e1c2db06a537a5c9d61102eb591776f1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Nov 2024 08:39:32 +0100 Subject: block: pre-calculate max_zone_append_sectors max_zone_append_sectors differs from all other queue limits in that the final value used is not stored in the queue_limits but needs to be obtained using queue_limits_max_zone_append_sectors helper. This not only adds (tiny) extra overhead to the I/O path, but also can be easily forgotten in file system code. Add a new max_hw_zone_append_sectors value to queue_limits which is set by the driver, and calculate max_zone_append_sectors from that and the other inputs in blk_validate_zoned_limits, similar to how max_sectors is calculated to fix this. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241104073955.112324-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7bfc877e159e..6d1413bd69a5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -375,6 +375,7 @@ struct queue_limits { unsigned int max_user_discard_sectors; unsigned int max_secure_erase_sectors; unsigned int max_write_zeroes_sectors; + unsigned int max_hw_zone_append_sectors; unsigned int max_zone_append_sectors; unsigned int discard_granularity; unsigned int discard_alignment; @@ -1204,25 +1205,9 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q) return q->limits.max_segment_size; } -static inline unsigned int -queue_limits_max_zone_append_sectors(const struct queue_limits *l) -{ - unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors); - - return min_not_zero(l->max_zone_append_sectors, max_sectors); -} - -static inline unsigned int queue_max_zone_append_sectors(struct request_queue *q) -{ - if (!blk_queue_is_zoned(q)) - return 0; - - return queue_limits_max_zone_append_sectors(&q->limits); -} - static inline bool queue_emulates_zone_append(struct request_queue *q) { - return blk_queue_is_zoned(q) && !q->limits.max_zone_append_sectors; + return blk_queue_is_zoned(q) && !q->limits.max_hw_zone_append_sectors; } static inline bool bdev_emulates_zone_append(struct block_device *bdev) @@ -1233,7 +1218,7 @@ static inline bool bdev_emulates_zone_append(struct block_device *bdev) static inline unsigned int bdev_max_zone_append_sectors(struct block_device *bdev) { - return queue_max_zone_append_sectors(bdev_get_queue(bdev)); + return bdev_limits(bdev)->max_zone_append_sectors; } static inline unsigned int bdev_max_segments(struct block_device *bdev) -- cgit v1.2.3 From 9a783139614fb837da4ccb2f8ec6f0ddc802b3d3 Mon Sep 17 00:00:00 2001 From: Alistair Francis Date: Mon, 4 Nov 2024 16:03:00 +1000 Subject: bpf: Move btf_type_is_struct_ptr() under CONFIG_BPF_SYSCALL The static inline btf_type_is_struct_ptr() function calls btf_type_skip_modifiers() which is guarded by CONFIG_BPF_SYSCALL. btf_type_is_struct_ptr() is also only called by CONFIG_BPF_SYSCALL ifdef code, so let's only expose btf_type_is_struct_ptr() if CONFIG_BPF_SYSCALL is defined. Signed-off-by: Alistair Francis Link: https://lore.kernel.org/r/20241104060300.421403-1-alistair.francis@wdc.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 631060e3ad14..4214e76c9168 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -582,6 +582,16 @@ int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_ty bool btf_types_are_same(const struct btf *btf1, u32 id1, const struct btf *btf2, u32 id2); int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx); + +static inline bool btf_type_is_struct_ptr(struct btf *btf, const struct btf_type *t) +{ + if (!btf_type_is_ptr(t)) + return false; + + t = btf_type_skip_modifiers(btf, t->type, NULL); + + return btf_type_is_struct(t); +} #else static inline const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) @@ -661,15 +671,4 @@ static inline int btf_check_iter_arg(struct btf *btf, const struct btf_type *fun return -EOPNOTSUPP; } #endif - -static inline bool btf_type_is_struct_ptr(struct btf *btf, const struct btf_type *t) -{ - if (!btf_type_is_ptr(t)) - return false; - - t = btf_type_skip_modifiers(btf, t->type, NULL); - - return btf_type_is_struct(t); -} - #endif -- cgit v1.2.3 From cb4158ce8ec8a5bb528cc1693356a5eb8058094d Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 4 Nov 2024 09:19:57 -0800 Subject: bpf: Mark raw_tp arguments with PTR_MAYBE_NULL Arguments to a raw tracepoint are tagged as trusted, which carries the semantics that the pointer will be non-NULL. However, in certain cases, a raw tracepoint argument may end up being NULL. More context about this issue is available in [0]. Thus, there is a discrepancy between the reality, that raw_tp arguments can actually be NULL, and the verifier's knowledge, that they are never NULL, causing explicit NULL checks to be deleted, and accesses to such pointers potentially crashing the kernel. To fix this, mark raw_tp arguments as PTR_MAYBE_NULL, and then special case the dereference and pointer arithmetic to permit it, and allow passing them into helpers/kfuncs; these exceptions are made for raw_tp programs only. Ensure that we don't do this when ref_obj_id > 0, as in that case this is an acquired object and doesn't need such adjustment. The reason we do mask_raw_tp_trusted_reg logic is because other will recheck in places whether the register is a trusted_reg, and then consider our register as untrusted when detecting the presence of the PTR_MAYBE_NULL flag. To allow safe dereference, we enable PROBE_MEM marking when we see loads into trusted pointers with PTR_MAYBE_NULL. While trusted raw_tp arguments can also be passed into helpers or kfuncs where such broken assumption may cause issues, a future patch set will tackle their case separately, as PTR_TO_BTF_ID (without PTR_TRUSTED) can already be passed into helpers and causes similar problems. Thus, they are left alone for now. It is possible that these checks also permit passing non-raw_tp args that are trusted PTR_TO_BTF_ID with null marking. In such a case, allowing dereference when pointer is NULL expands allowed behavior, so won't regress existing programs, and the case of passing these into helpers is the same as above and will be dealt with later. Also update the failure case in tp_btf_nullable selftest to capture the new behavior, as the verifier will no longer cause an error when directly dereference a raw tracepoint argument marked as __nullable. [0]: https://lore.kernel.org/bpf/ZrCZS6nisraEqehw@jlelli-thinkpadt14gen4.remote.csb Reviewed-by: Jiri Olsa Reported-by: Juri Lelli Tested-by: Juri Lelli Fixes: 3f00c5239344 ("bpf: Allow trusted pointers to be passed to KF_TRUSTED_ARGS kfuncs") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241104171959.2938862-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c3ba4d475174..1b84613b10ac 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3495,4 +3495,10 @@ static inline bool bpf_is_subprog(const struct bpf_prog *prog) return prog->aux->func_idx != 0; } +static inline bool bpf_prog_is_raw_tp(const struct bpf_prog *prog) +{ + return prog->type == BPF_PROG_TYPE_TRACING && + prog->expected_attach_type == BPF_TRACE_RAW_TP; +} + #endif /* _LINUX_BPF_H */ -- cgit v1.2.3 From ad37bcd965fda43f34cf5cc051f5d310880bd1e7 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Wed, 30 Oct 2024 16:04:25 +0000 Subject: rust: add tracepoint support Make it possible to have Rust code call into tracepoints defined by C code. It is still required that the tracepoint is declared in a C header, and that this header is included in the input to bindgen. Instead of calling __DO_TRACE directly, the exported rust_do_trace_ function calls an inline helper function. This is because the `cond` argument does not exist at the callsite of DEFINE_RUST_DO_TRACE. __DECLARE_TRACE always emits an inline static and an extern declaration that is only used when CREATE_RUST_TRACE_POINTS is set. These should not end up in the final binary so it is not a problem that they sometimes are emitted without a user. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Josh Poimboeuf Cc: Jason Baron Cc: Ard Biesheuvel Cc: Miguel Ojeda Cc: Alex Gaynor Cc: Wedson Almeida Filho Cc: " =?utf-8?q?Bj=C3=B6rn_Roy_Baron?= " Cc: Benno Lossin Cc: Andreas Hindborg Cc: Arnd Bergmann Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Sean Christopherson Cc: Uros Bizjak Cc: Catalin Marinas Cc: Will Deacon Cc: Marc Zyngier Cc: Oliver Upton Cc: Mark Rutland Cc: Ryan Roberts Cc: Fuad Tabba Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Anup Patel Cc: Andrew Jones Cc: Alexandre Ghiti Cc: Conor Dooley Cc: Samuel Holland Cc: Huacai Chen Cc: WANG Xuerui Cc: Bibo Mao Cc: Tiezhu Yang Cc: Andrew Morton Cc: Tianrui Zhao Link: https://lore.kernel.org/20241030-tracepoint-v12-2-eec7f0f8ad22@google.com Reviewed-by: Carlos Llamas Reviewed-by: Gary Guo Reviewed-by: Boqun Feng Signed-off-by: Alice Ryhl Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 0dc67fad706c..84c4924e499f 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -225,6 +225,18 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) preempt_enable_notrace(); \ } while (0) +/* + * Declare an exported function that Rust code can call to trigger this + * tracepoint. This function does not include the static branch; that is done + * in Rust to avoid a function call when the tracepoint is disabled. + */ +#define DEFINE_RUST_DO_TRACE(name, proto, args) +#define __DEFINE_RUST_DO_TRACE(name, proto, args) \ + notrace void rust_do_trace_##name(proto) \ + { \ + __rust_do_trace_##name(args); \ + } + /* * Make sure the alignment of the structure in the __tracepoints section will * not add unwanted padding between the beginning of the section and the @@ -240,6 +252,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) extern int __traceiter_##name(data_proto); \ DECLARE_STATIC_CALL(tp_func_##name, __traceiter_##name); \ extern struct tracepoint __tracepoint_##name; \ + extern void rust_do_trace_##name(proto); \ static inline int \ register_trace_##name(void (*probe)(data_proto), void *data) \ { \ @@ -271,6 +284,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DECLARE_TRACE(name, proto, args, cond, data_proto) \ __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), cond, PARAMS(data_proto)) \ + static inline void __rust_do_trace_##name(proto) \ + { \ + __DO_TRACE(name, \ + TP_ARGS(args), \ + TP_CONDITION(cond), 0); \ + } \ static inline void trace_##name(proto) \ { \ if (static_branch_unlikely(&__tracepoint_##name.key)) \ @@ -285,6 +304,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DECLARE_TRACE_SYSCALL(name, proto, args, cond, data_proto) \ __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), cond, PARAMS(data_proto)) \ + static inline void __rust_do_trace_##name(proto) \ + { \ + __DO_TRACE(name, \ + TP_ARGS(args), \ + TP_CONDITION(cond), 1); \ + } \ static inline void trace_##name(proto) \ { \ if (static_branch_unlikely(&__tracepoint_##name.key)) \ @@ -339,7 +364,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) void __probestub_##_name(void *__data, proto) \ { \ } \ - DEFINE_STATIC_CALL(tp_func_##_name, __traceiter_##_name); + DEFINE_STATIC_CALL(tp_func_##_name, __traceiter_##_name); \ + DEFINE_RUST_DO_TRACE(_name, TP_PROTO(proto), TP_ARGS(args)) #define DEFINE_TRACE(name, proto, args) \ DEFINE_TRACE_FN(name, NULL, NULL, PARAMS(proto), PARAMS(args)); -- cgit v1.2.3 From 5609296750afd6462a4d994b6803ccc5e8bf1d4e Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 30 Oct 2024 16:39:51 +0000 Subject: PM: EM: Add min/max available performance state limits On some devices there are HW dependencies for shared frequency and voltage between devices. It will impact Energy Aware Scheduler (EAS) decision, where CPUs share the voltage & frequency domain with other CPUs or devices e.g. - Mid CPUs + Big CPU - Little CPU + L3 cache in DSU - some other device + Little CPUs Detailed explanation of one example: When the L3 cache frequency is increased, the affected Little CPUs might run at higher voltage and frequency. That higher voltage causes higher CPU power and thus more energy is used for running the tasks. This is important for background running tasks, which try to run on energy efficient CPUs. Therefore, add performance state limits which are applied for the device (in this case CPU). This is important on SoCs with HW dependencies mentioned above so that the Energy Aware Scheduler (EAS) does not use performance states outside the valid min-max range for energy calculation. Signed-off-by: Lukasz Luba Link: https://patch.msgid.link/20241030164126.1263793-2-lukasz.luba@arm.com Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 1ff52020cf75..752e0b297582 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -55,6 +55,8 @@ struct em_perf_table { * struct em_perf_domain - Performance domain * @em_table: Pointer to the runtime modifiable em_perf_table * @nr_perf_states: Number of performance states + * @min_perf_state: Minimum allowed Performance State index + * @max_perf_state: Maximum allowed Performance State index * @flags: See "em_perf_domain flags" * @cpus: Cpumask covering the CPUs of the domain. It's here * for performance reasons to avoid potential cache @@ -70,6 +72,8 @@ struct em_perf_table { struct em_perf_domain { struct em_perf_table __rcu *em_table; int nr_perf_states; + int min_perf_state; + int max_perf_state; unsigned long flags; unsigned long cpus[]; }; @@ -173,13 +177,14 @@ void em_table_free(struct em_perf_table __rcu *table); int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, int nr_states); int em_dev_update_chip_binning(struct device *dev); +int em_update_performance_limits(struct em_perf_domain *pd, + unsigned long freq_min_khz, unsigned long freq_max_khz); /** * em_pd_get_efficient_state() - Get an efficient performance state from the EM * @table: List of performance states, in ascending order - * @nr_perf_states: Number of performance states + * @pd: performance domain for which this must be done * @max_util: Max utilization to map with the EM - * @pd_flags: Performance Domain flags * * It is called from the scheduler code quite frequently and as a consequence * doesn't implement any check. @@ -188,13 +193,16 @@ int em_dev_update_chip_binning(struct device *dev); * requirement. */ static inline int -em_pd_get_efficient_state(struct em_perf_state *table, int nr_perf_states, - unsigned long max_util, unsigned long pd_flags) +em_pd_get_efficient_state(struct em_perf_state *table, + struct em_perf_domain *pd, unsigned long max_util) { + unsigned long pd_flags = pd->flags; + int min_ps = pd->min_perf_state; + int max_ps = pd->max_perf_state; struct em_perf_state *ps; int i; - for (i = 0; i < nr_perf_states; i++) { + for (i = min_ps; i <= max_ps; i++) { ps = &table[i]; if (ps->performance >= max_util) { if (pd_flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES && @@ -204,7 +212,7 @@ em_pd_get_efficient_state(struct em_perf_state *table, int nr_perf_states, } } - return nr_perf_states - 1; + return max_ps; } /** @@ -253,8 +261,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, * requested performance. */ em_table = rcu_dereference(pd->em_table); - i = em_pd_get_efficient_state(em_table->state, pd->nr_perf_states, - max_util, pd->flags); + i = em_pd_get_efficient_state(em_table->state, pd, max_util); ps = &em_table->state[i]; /* @@ -391,6 +398,12 @@ static inline int em_dev_update_chip_binning(struct device *dev) { return -EINVAL; } +static inline +int em_update_performance_limits(struct em_perf_domain *pd, + unsigned long freq_min_khz, unsigned long freq_max_khz) +{ + return -EINVAL; +} #endif #endif -- cgit v1.2.3 From 9e0933c21c128d6d8ac4d8aae0babaf9a43100b8 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 4 Nov 2024 16:14:02 -0800 Subject: fs: iomap: Atomic write support Support direct I/O atomic writes by producing a single bio with REQ_ATOMIC flag set. Initially FSes (XFS) should only support writing a single FS block atomically. As with any atomic write, we should produce a single bio which covers the complete write length. Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: John Garry Reviewed-by: Ritesh Harjani (IBM) [djwong: clarify a couple of things in the docs] Signed-off-by: Darrick J. Wong --- include/linux/iomap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 4ad12a3c8bae..c7644bdcfca3 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -178,6 +178,7 @@ struct iomap_folio_ops { #else #define IOMAP_DAX 0 #endif /* CONFIG_FS_DAX */ +#define IOMAP_ATOMIC (1 << 9) struct iomap_ops { /* -- cgit v1.2.3 From 0a2cdeeae9ddc14d752173be6af98bc9fb45c6ad Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Mon, 4 Nov 2024 15:00:41 +0800 Subject: net: tcp: replace the document for "lsndtime" in tcp_sock Commit d5fed5addb2b ("tcp: reorganize tcp_sock fast path variables") moved the fields around and misplaced the documentation for "lsndtime". So, let's replace it in the proper place. Signed-off-by: Menglong Dong Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241104070041.64302-1-dongml2@chinatelecom.cn Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 6a5e08b937b3..f88daaa76d83 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -200,7 +200,6 @@ struct tcp_sock { /* TX read-mostly hotpath cache lines */ __cacheline_group_begin(tcp_sock_read_tx); - /* timestamp of last sent data packet (for restart window) */ u32 max_window; /* Maximal window ever seen from peer */ u32 rcv_ssthresh; /* Current window clamp */ u32 reordering; /* Packet reordering metric. */ @@ -263,7 +262,7 @@ struct tcp_sock { u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ u32 pushed_seq; /* Last pushed seq, required to talk to windows */ - u32 lsndtime; + u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ u32 mdev_us; /* medium deviation */ u32 rtt_seq; /* sequence number to update rttvar */ u64 tcp_wstamp_ns; /* departure time for next sent data packet */ -- cgit v1.2.3 From f248ff14b7589306c8af922465aefedf9b10fa9e Mon Sep 17 00:00:00 2001 From: Desnes Nunes Date: Thu, 31 Oct 2024 11:28:00 -0300 Subject: misc: rtsx: Cleanup on DRV_NAME cardreader variables The rtsx_pci_ms memstick driver has been dropped, thus there is no more need for DRV_NAME_RTSX_PCI_MS variable. Additionally, this also stand- arizes DRV_NAME variables on alcor_pci and rtsx_usb drivers. Fixes: d0f459259c13 ("memstick: rtsx_pci_ms: Remove Realtek PCI memstick driver") Signed-off-by: Desnes Nunes Link: https://lore.kernel.org/r/20241031142801.1141680-1-desnesn@redhat.com Signed-off-by: Greg Kroah-Hartman --- include/linux/alcor_pci.h | 1 + include/linux/rtsx_common.h | 1 - include/linux/rtsx_usb.h | 4 ++++ 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/alcor_pci.h b/include/linux/alcor_pci.h index c4a0b23846d8..dcb1d37dabc2 100644 --- a/include/linux/alcor_pci.h +++ b/include/linux/alcor_pci.h @@ -11,6 +11,7 @@ #define ALCOR_SD_CARD 0 #define ALCOR_MS_CARD 1 +#define DRV_NAME_ALCOR_PCI "alcor_pci" #define DRV_NAME_ALCOR_PCI_SDMMC "alcor_sdmmc" #define DRV_NAME_ALCOR_PCI_MS "alcor_ms" diff --git a/include/linux/rtsx_common.h b/include/linux/rtsx_common.h index bf290ad14c57..da9c8c6b5d50 100644 --- a/include/linux/rtsx_common.h +++ b/include/linux/rtsx_common.h @@ -12,7 +12,6 @@ #define DRV_NAME_RTSX_PCI "rtsx_pci" #define DRV_NAME_RTSX_PCI_SDMMC "rtsx_pci_sdmmc" -#define DRV_NAME_RTSX_PCI_MS "rtsx_pci_ms" #define RTSX_REG_PAIR(addr, val) (((u32)(addr) << 16) | (u8)(val)) diff --git a/include/linux/rtsx_usb.h b/include/linux/rtsx_usb.h index 3247ed8e9ff0..f267a06c6b1e 100644 --- a/include/linux/rtsx_usb.h +++ b/include/linux/rtsx_usb.h @@ -12,6 +12,10 @@ #include +#define DRV_NAME_RTSX_USB "rtsx_usb" +#define DRV_NAME_RTSX_USB_SDMMC "rtsx_usb_sdmmc" +#define DRV_NAME_RTSX_USB_MS "rtsx_usb_ms" + /* related module names */ #define RTSX_USB_SD_CARD 0 #define RTSX_USB_MS_CARD 1 -- cgit v1.2.3 From 076354a4d4a73cb792a680a7f40f603c9b145a76 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Fri, 18 Oct 2024 10:58:20 -0300 Subject: watchdog: da9063: Do not use a global variable Using the 'use_sw_pm' variable as global is not recommended as it prevents multi instances of the driver to run. Make it a member of the da9063 structure instead. Signed-off-by: Fabio Estevam Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20241018135821.274376-1-festevam@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- include/linux/mfd/da9063/core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mfd/da9063/core.h b/include/linux/mfd/da9063/core.h index 8db52324f416..eae82f421414 100644 --- a/include/linux/mfd/da9063/core.h +++ b/include/linux/mfd/da9063/core.h @@ -78,6 +78,7 @@ struct da9063 { enum da9063_type type; unsigned char variant_code; unsigned int flags; + bool use_sw_pm; /* Control interface */ struct regmap *regmap; -- cgit v1.2.3 From 7fce207af5ec074a9a50e90eb866b17ca4a90f06 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 24 Oct 2024 10:18:07 -0700 Subject: mm/writeback: add folio_mark_dirty_lock() Add a new convenience helper folio_mark_dirty_lock() that grabs the folio lock before calling folio_mark_dirty(). Refactor set_page_dirty_lock() to directly use folio_mark_dirty_lock(). Signed-off-by: Joanne Koong Signed-off-by: Miklos Szeredi --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ecf63d2b0582..446d7096c48f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2539,6 +2539,7 @@ struct kvec; struct page *get_dump_page(unsigned long addr); bool folio_mark_dirty(struct folio *folio); +bool folio_mark_dirty_lock(struct folio *folio); bool set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); -- cgit v1.2.3 From 35890f85573c2ebbbf3491dc66f7ee2ad63055af Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 30 Oct 2024 21:20:45 -0300 Subject: vfio: Remove VFIO_TYPE1_NESTING_IOMMU This control causes the ARM SMMU drivers to choose a stage 2 implementation for the IO pagetable (vs the stage 1 usual default), however this choice has no significant visible impact to the VFIO user. Further qemu never implemented this and no other userspace user is known. The original description in commit f5c9ecebaf2a ("vfio/iommu_type1: add new VFIO_TYPE1_NESTING_IOMMU IOMMU type") suggested this was to "provide SMMU translation services to the guest operating system" however the rest of the API to set the guest table pointer for the stage 1 and manage invalidation was never completed, or at least never upstreamed, rendering this part useless dead code. Upstream has now settled on iommufd as the uAPI for controlling nested translation. Choosing the stage 2 implementation should be done by through the IOMMU_HWPT_ALLOC_NEST_PARENT flag during domain allocation. Remove VFIO_TYPE1_NESTING_IOMMU and everything under it including the enable_nesting iommu_domain_op. Just in-case there is some userspace using this continue to treat requesting it as a NOP, but do not advertise support any more. Acked-by: Alex Williamson Reviewed-by: Mostafa Saleh Reviewed-by: Kevin Tian Reviewed-by: Jerry Snitselaar Reviewed-by: Donald Dutile Tested-by: Nicolin Chen Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com Signed-off-by: Will Deacon --- include/linux/iommu.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index bd722f473635..c88d18d2c928 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -635,7 +635,6 @@ struct iommu_ops { * @enforce_cache_coherency: Prevent any kind of DMA from bypassing IOMMU_CACHE, * including no-snoop TLPs on PCIe or other platform * specific mechanisms. - * @enable_nesting: Enable nesting * @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*) * @free: Release the domain after use. */ @@ -663,7 +662,6 @@ struct iommu_domain_ops { dma_addr_t iova); bool (*enforce_cache_coherency)(struct iommu_domain *domain); - int (*enable_nesting)(struct iommu_domain *domain); int (*set_pgtable_quirks)(struct iommu_domain *domain, unsigned long quirks); @@ -844,7 +842,6 @@ extern void iommu_group_put(struct iommu_group *group); extern int iommu_group_id(struct iommu_group *group); extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *); -int iommu_enable_nesting(struct iommu_domain *domain); int iommu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirks); -- cgit v1.2.3 From 807404d66fcf898d4bcc6a3e3edb07ffd5b88400 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 30 Oct 2024 21:20:47 -0300 Subject: ACPI/IORT: Support CANWBS memory access flag The IORT spec, Issue E.f (April 2024), adds a new CANWBS bit to the Memory Access Flag field in the Memory Access Properties table, mainly for a PCI Root Complex. This CANWBS defines the coherency of memory accesses to be not marked IOWB cacheable/shareable. Its value further implies the coherency impact from a pair of mismatched memory attributes (e.g. in a nested translation case): 0x0: Use of mismatched memory attributes for accesses made by this device may lead to a loss of coherency. 0x1: Coherency of accesses made by this device to locations in Conventional memory are ensured as follows, even if the memory attributes for the accesses presented by the device or provided by the SMMU are different from Inner and Outer Write-back cacheable, Shareable. Note that the loss of coherency on a CANWBS-unsupported HW typically could occur to an SMMU that doesn't implement the S2FWB feature where additional cache flush operations would be required to prevent that from happening. Add a new ACPI_IORT_MF_CANWBS flag and set IOMMU_FWSPEC_PCI_RC_CANWBS upon the presence of this new flag. CANWBS and S2FWB are similar features, in that they both guarantee the VM can not violate coherency, however S2FWB can be bypassed by PCI No Snoop TLPs, while CANWBS cannot. Thus CANWBS meets the requirements to set IOMMU_CAP_ENFORCE_CACHE_COHERENCY. Architecturally ARM has expected that VFIO would disable No Snoop through PCI Config space, if this is done then the two would have the same protections. Tested-by: Nicolin Chen Signed-off-by: Nicolin Chen Reviewed-by: Kevin Tian Acked-by: Hanjun Guo Reviewed-by: Jerry Snitselaar Reviewed-by: Donald Dutile Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com Signed-off-by: Will Deacon --- include/linux/iommu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c88d18d2c928..4ad9b9ec6c9b 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -991,6 +991,8 @@ struct iommu_fwspec { /* ATS is supported */ #define IOMMU_FWSPEC_PCI_RC_ATS (1 << 0) +/* CANWBS is supported */ +#define IOMMU_FWSPEC_PCI_RC_CANWBS (1 << 1) /* * An iommu attach handle represents a relationship between an iommu domain -- cgit v1.2.3 From 5c1806c41ce0a0110db5dd4c483cf2dc28b3ddf0 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 4 Nov 2024 16:43:07 +0100 Subject: kcsan, seqlock: Support seqcount_latch_t While fuzzing an arm64 kernel, Alexander Potapenko reported: | BUG: KCSAN: data-race in ktime_get_mono_fast_ns / timekeeping_update | | write to 0xffffffc082e74248 of 56 bytes by interrupt on cpu 0: | update_fast_timekeeper kernel/time/timekeeping.c:430 [inline] | timekeeping_update+0x1d8/0x2d8 kernel/time/timekeeping.c:768 | timekeeping_advance+0x9e8/0xb78 kernel/time/timekeeping.c:2344 | update_wall_time+0x18/0x38 kernel/time/timekeeping.c:2360 | [...] | | read to 0xffffffc082e74258 of 8 bytes by task 5260 on cpu 1: | __ktime_get_fast_ns kernel/time/timekeeping.c:372 [inline] | ktime_get_mono_fast_ns+0x88/0x174 kernel/time/timekeeping.c:489 | init_srcu_struct_fields+0x40c/0x530 kernel/rcu/srcutree.c:263 | init_srcu_struct+0x14/0x20 kernel/rcu/srcutree.c:311 | [...] | | value changed: 0x000002f875d33266 -> 0x000002f877416866 | | Reported by Kernel Concurrency Sanitizer on: | CPU: 1 UID: 0 PID: 5260 Comm: syz.2.7483 Not tainted 6.12.0-rc3-dirty #78 This is a false positive data race between a seqcount latch writer and a reader accessing stale data. Since its introduction, KCSAN has never understood the seqcount_latch interface (due to being unannotated). Unlike the regular seqlock interface, the seqcount_latch interface for latch writers never has had a well-defined critical section, making it difficult to teach tooling where the critical section starts and ends. Introduce an instrumentable (non-raw) seqcount_latch interface, with which we can clearly denote writer critical sections. This both helps readability and tooling like KCSAN to understand when the writer is done updating all latch copies. Fixes: 88ecd153be95 ("seqlock, kcsan: Add annotations for KCSAN") Reported-by: Alexander Potapenko Co-developed-by: "Peter Zijlstra (Intel)" Signed-off-by: "Peter Zijlstra (Intel)" Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20241104161910.780003-4-elver@google.com --- include/linux/seqlock.h | 86 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 71 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index fffeb754880f..45eee0e5dca0 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -621,6 +621,23 @@ static __always_inline unsigned raw_read_seqcount_latch(const seqcount_latch_t * return READ_ONCE(s->seqcount.sequence); } +/** + * read_seqcount_latch() - pick even/odd latch data copy + * @s: Pointer to seqcount_latch_t + * + * See write_seqcount_latch() for details and a full reader/writer usage + * example. + * + * Return: sequence counter raw value. Use the lowest bit as an index for + * picking which data copy to read. The full counter must then be checked + * with read_seqcount_latch_retry(). + */ +static __always_inline unsigned read_seqcount_latch(const seqcount_latch_t *s) +{ + kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); + return raw_read_seqcount_latch(s); +} + /** * raw_read_seqcount_latch_retry() - end a seqcount_latch_t read section * @s: Pointer to seqcount_latch_t @@ -635,9 +652,34 @@ raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start) return unlikely(READ_ONCE(s->seqcount.sequence) != start); } +/** + * read_seqcount_latch_retry() - end a seqcount_latch_t read section + * @s: Pointer to seqcount_latch_t + * @start: count, from read_seqcount_latch() + * + * Return: true if a read section retry is required, else false + */ +static __always_inline int +read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start) +{ + kcsan_atomic_next(0); + return raw_read_seqcount_latch_retry(s, start); +} + /** * raw_write_seqcount_latch() - redirect latch readers to even/odd copy * @s: Pointer to seqcount_latch_t + */ +static __always_inline void raw_write_seqcount_latch(seqcount_latch_t *s) +{ + smp_wmb(); /* prior stores before incrementing "sequence" */ + s->seqcount.sequence++; + smp_wmb(); /* increment "sequence" before following stores */ +} + +/** + * write_seqcount_latch_begin() - redirect latch readers to odd copy + * @s: Pointer to seqcount_latch_t * * The latch technique is a multiversion concurrency control method that allows * queries during non-atomic modifications. If you can guarantee queries never @@ -665,17 +707,11 @@ raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start) * * void latch_modify(struct latch_struct *latch, ...) * { - * smp_wmb(); // Ensure that the last data[1] update is visible - * latch->seq.sequence++; - * smp_wmb(); // Ensure that the seqcount update is visible - * + * write_seqcount_latch_begin(&latch->seq); * modify(latch->data[0], ...); - * - * smp_wmb(); // Ensure that the data[0] update is visible - * latch->seq.sequence++; - * smp_wmb(); // Ensure that the seqcount update is visible - * + * write_seqcount_latch(&latch->seq); * modify(latch->data[1], ...); + * write_seqcount_latch_end(&latch->seq); * } * * The query will have a form like:: @@ -686,13 +722,13 @@ raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start) * unsigned seq, idx; * * do { - * seq = raw_read_seqcount_latch(&latch->seq); + * seq = read_seqcount_latch(&latch->seq); * * idx = seq & 0x01; * entry = data_query(latch->data[idx], ...); * * // This includes needed smp_rmb() - * } while (raw_read_seqcount_latch_retry(&latch->seq, seq)); + * } while (read_seqcount_latch_retry(&latch->seq, seq)); * * return entry; * } @@ -716,11 +752,31 @@ raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start) * When data is a dynamic data structure; one should use regular RCU * patterns to manage the lifetimes of the objects within. */ -static inline void raw_write_seqcount_latch(seqcount_latch_t *s) +static __always_inline void write_seqcount_latch_begin(seqcount_latch_t *s) { - smp_wmb(); /* prior stores before incrementing "sequence" */ - s->seqcount.sequence++; - smp_wmb(); /* increment "sequence" before following stores */ + kcsan_nestable_atomic_begin(); + raw_write_seqcount_latch(s); +} + +/** + * write_seqcount_latch() - redirect latch readers to even copy + * @s: Pointer to seqcount_latch_t + */ +static __always_inline void write_seqcount_latch(seqcount_latch_t *s) +{ + raw_write_seqcount_latch(s); +} + +/** + * write_seqcount_latch_end() - end a seqcount_latch_t write section + * @s: Pointer to seqcount_latch_t + * + * Marks the end of a seqcount_latch_t writer section, after all copies of the + * latch-protected data have been updated. + */ +static __always_inline void write_seqcount_latch_end(seqcount_latch_t *s) +{ + kcsan_nestable_atomic_end(); } #define __SEQLOCK_UNLOCKED(lockname) \ -- cgit v1.2.3 From 93190bc35d6d4364a4d8c38ac8961dabecbff4ed Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 4 Nov 2024 16:43:08 +0100 Subject: seqlock, treewide: Switch to non-raw seqcount_latch interface Switch all instrumentable users of the seqcount_latch interface over to the non-raw interface. Co-developed-by: "Peter Zijlstra (Intel)" Signed-off-by: "Peter Zijlstra (Intel)" Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20241104161910.780003-5-elver@google.com --- include/linux/rbtree_latch.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rbtree_latch.h b/include/linux/rbtree_latch.h index 6a0999c26c7c..2f630eb8307e 100644 --- a/include/linux/rbtree_latch.h +++ b/include/linux/rbtree_latch.h @@ -14,7 +14,7 @@ * * If we need to allow unconditional lookups (say as required for NMI context * usage) we need a more complex setup; this data structure provides this by - * employing the latch technique -- see @raw_write_seqcount_latch -- to + * employing the latch technique -- see @write_seqcount_latch_begin -- to * implement a latched RB-tree which does allow for unconditional lookups by * virtue of always having (at least) one stable copy of the tree. * @@ -132,7 +132,7 @@ __lt_find(void *key, struct latch_tree_root *ltr, int idx, * @ops: operators defining the node order * * It inserts @node into @root in an ordered fashion such that we can always - * observe one complete tree. See the comment for raw_write_seqcount_latch(). + * observe one complete tree. See the comment for write_seqcount_latch_begin(). * * The inserts use rcu_assign_pointer() to publish the element such that the * tree structure is stored before we can observe the new @node. @@ -145,10 +145,11 @@ latch_tree_insert(struct latch_tree_node *node, struct latch_tree_root *root, const struct latch_tree_ops *ops) { - raw_write_seqcount_latch(&root->seq); + write_seqcount_latch_begin(&root->seq); __lt_insert(node, root, 0, ops->less); - raw_write_seqcount_latch(&root->seq); + write_seqcount_latch(&root->seq); __lt_insert(node, root, 1, ops->less); + write_seqcount_latch_end(&root->seq); } /** @@ -159,7 +160,7 @@ latch_tree_insert(struct latch_tree_node *node, * * Removes @node from the trees @root in an ordered fashion such that we can * always observe one complete tree. See the comment for - * raw_write_seqcount_latch(). + * write_seqcount_latch_begin(). * * It is assumed that @node will observe one RCU quiescent state before being * reused of freed. @@ -172,10 +173,11 @@ latch_tree_erase(struct latch_tree_node *node, struct latch_tree_root *root, const struct latch_tree_ops *ops) { - raw_write_seqcount_latch(&root->seq); + write_seqcount_latch_begin(&root->seq); __lt_erase(node, root, 0); - raw_write_seqcount_latch(&root->seq); + write_seqcount_latch(&root->seq); __lt_erase(node, root, 1); + write_seqcount_latch_end(&root->seq); } /** @@ -204,9 +206,9 @@ latch_tree_find(void *key, struct latch_tree_root *root, unsigned int seq; do { - seq = raw_read_seqcount_latch(&root->seq); + seq = read_seqcount_latch(&root->seq); node = __lt_find(key, root, seq & 1, ops->comp); - } while (raw_read_seqcount_latch_retry(&root->seq, seq)); + } while (read_seqcount_latch_retry(&root->seq, seq)); return node; } -- cgit v1.2.3 From 183ec5f26b2fc97a4a9871865bfe9b33c41fddb2 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 4 Nov 2024 16:43:09 +0100 Subject: kcsan, seqlock: Fix incorrect assumption in read_seqbegin() During testing of the preceding changes, I noticed that in some cases, current->kcsan_ctx.in_flat_atomic remained true until task exit. This is obviously wrong, because _all_ accesses for the given task will be treated as atomic, resulting in false negatives i.e. missed data races. Debugging led to fs/dcache.c, where we can see this usage of seqlock: struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name) { struct dentry *dentry; unsigned seq; do { seq = read_seqbegin(&rename_lock); dentry = __d_lookup(parent, name); if (dentry) break; } while (read_seqretry(&rename_lock, seq)); [...] As can be seen, read_seqretry() is never called if dentry != NULL; consequently, current->kcsan_ctx.in_flat_atomic will never be reset to false by read_seqretry(). Give up on the wrong assumption of "assume closing read_seqretry()", and rely on the already-present annotations in read_seqcount_begin/retry(). Fixes: 88ecd153be95 ("seqlock, kcsan: Add annotations for KCSAN") Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20241104161910.780003-6-elver@google.com --- include/linux/seqlock.h | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 45eee0e5dca0..5298765d6ca4 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -810,11 +810,7 @@ static __always_inline void write_seqcount_latch_end(seqcount_latch_t *s) */ static inline unsigned read_seqbegin(const seqlock_t *sl) { - unsigned ret = read_seqcount_begin(&sl->seqcount); - - kcsan_atomic_next(0); /* non-raw usage, assume closing read_seqretry() */ - kcsan_flat_atomic_begin(); - return ret; + return read_seqcount_begin(&sl->seqcount); } /** @@ -830,12 +826,6 @@ static inline unsigned read_seqbegin(const seqlock_t *sl) */ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) { - /* - * Assume not nested: read_seqretry() may be called multiple times when - * completing read critical section. - */ - kcsan_flat_atomic_end(); - return read_seqcount_retry(&sl->seqcount, start); } -- cgit v1.2.3 From 0f0d1b8e5010bfe1feeb4d78d137e41946a5370d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 28 Oct 2024 14:20:35 +0100 Subject: sched/ext: Remove sched_fork() hack Instead of solving the underlying problem of the double invocation of __sched_fork() for idle tasks, sched-ext decided to hack around the issue by partially clearing out the entity struct to preserve the already enqueued node. A provided analysis and solution has been ignored for four months. Now that someone else has taken care of cleaning it up, remove the disgusting hack and clear out the full structure. Remove the comment in the structure declaration as well, as there is no requirement for @node being the last element anymore. Fixes: f0e1a0643a59 ("sched_ext: Implement BPF extensible scheduler class") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Link: https://lore.kernel.org/r/87ldy82wkc.ffs@tglx --- include/linux/sched/ext.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 1ddbde64a31b..2799e7284fff 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -199,7 +199,6 @@ struct sched_ext_entity { #ifdef CONFIG_EXT_GROUP_SCHED struct cgroup *cgrp_moving_from; #endif - /* must be the last field, see init_scx_entity() */ struct list_head tasks_node; }; -- cgit v1.2.3 From 26baa1f1c4bdc34b8d698c1900b407d863ad0e69 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 4 Oct 2024 14:47:02 +0200 Subject: sched: Add TIF_NEED_RESCHED_LAZY infrastructure Add the basic infrastructure to split the TIF_NEED_RESCHED bit in two. Either bit will cause a resched on return-to-user, but only TIF_NEED_RESCHED will drive IRQ preemption. No behavioural change intended. Suggested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Sebastian Andrzej Siewior Link: https://lkml.kernel.org/r/20241007075055.219540785@infradead.org --- include/linux/entry-common.h | 3 ++- include/linux/entry-kvm.h | 5 +++-- include/linux/sched.h | 3 ++- include/linux/thread_info.h | 21 +++++++++++++++++---- 4 files changed, 24 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 1e50cdb83ae5..fc61d0205c97 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -64,7 +64,8 @@ #define EXIT_TO_USER_MODE_WORK \ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ - _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ + _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ + _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ ARCH_EXIT_TO_USER_MODE_WORK) /** diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h index 6813171afccb..16149f6625e4 100644 --- a/include/linux/entry-kvm.h +++ b/include/linux/entry-kvm.h @@ -17,8 +17,9 @@ #endif #define XFER_TO_GUEST_MODE_WORK \ - (_TIF_NEED_RESCHED | _TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL | \ - _TIF_NOTIFY_RESUME | ARCH_XFER_TO_GUEST_MODE_WORK) + (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | _TIF_SIGPENDING | \ + _TIF_NOTIFY_SIGNAL | _TIF_NOTIFY_RESUME | \ + ARCH_XFER_TO_GUEST_MODE_WORK) struct kvm_vcpu; diff --git a/include/linux/sched.h b/include/linux/sched.h index a76e3d074a2a..1d5cc3e50884 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2002,7 +2002,8 @@ static inline void set_tsk_need_resched(struct task_struct *tsk) static inline void clear_tsk_need_resched(struct task_struct *tsk) { - clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); + atomic_long_andnot(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY, + (atomic_long_t *)&task_thread_info(tsk)->flags); } static inline int test_tsk_need_resched(struct task_struct *tsk) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 9ea0b28068f4..cf2446c9c30d 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -59,6 +59,14 @@ enum syscall_work_bit { #include +#ifndef TIF_NEED_RESCHED_LAZY +#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY +#error Inconsistent PREEMPT_LAZY +#endif +#define TIF_NEED_RESCHED_LAZY TIF_NEED_RESCHED +#define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED +#endif + #ifdef __KERNEL__ #ifndef arch_set_restart_data @@ -179,22 +187,27 @@ static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti #ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H -static __always_inline bool tif_need_resched(void) +static __always_inline bool tif_test_bit(int bit) { - return arch_test_bit(TIF_NEED_RESCHED, + return arch_test_bit(bit, (unsigned long *)(¤t_thread_info()->flags)); } #else -static __always_inline bool tif_need_resched(void) +static __always_inline bool tif_test_bit(int bit) { - return test_bit(TIF_NEED_RESCHED, + return test_bit(bit, (unsigned long *)(¤t_thread_info()->flags)); } #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */ +static __always_inline bool tif_need_resched(void) +{ + return tif_test_bit(TIF_NEED_RESCHED); +} + #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES static inline int arch_within_stack_frames(const void * const stack, const void * const stackend, -- cgit v1.2.3 From 7c70cb94d29cd325fabe4a818c18613e3b9919a1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 4 Oct 2024 14:46:58 +0200 Subject: sched: Add Lazy preemption model Change fair to use resched_curr_lazy(), which, when the lazy preemption model is selected, will set TIF_NEED_RESCHED_LAZY. This LAZY bit will be promoted to the full NEED_RESCHED bit on tick. As such, the average delay between setting LAZY and actually rescheduling will be TICK_NSEC/2. In short, Lazy preemption will delay preemption for fair class but will function as Full preemption for all the other classes, most notably the realtime (RR/FIFO/DEADLINE) classes. The goal is to bridge the performance gap with Voluntary, such that we might eventually remove that option entirely. Suggested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Sebastian Andrzej Siewior Link: https://lkml.kernel.org/r/20241007075055.331243614@infradead.org --- include/linux/preempt.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index ce76f1a45722..ca86235ac15c 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -486,6 +486,7 @@ DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) extern bool preempt_model_none(void); extern bool preempt_model_voluntary(void); extern bool preempt_model_full(void); +extern bool preempt_model_lazy(void); #else @@ -502,6 +503,11 @@ static inline bool preempt_model_full(void) return IS_ENABLED(CONFIG_PREEMPT); } +static inline bool preempt_model_lazy(void) +{ + return IS_ENABLED(CONFIG_PREEMPT_LAZY); +} + #endif static inline bool preempt_model_rt(void) @@ -519,7 +525,7 @@ static inline bool preempt_model_rt(void) */ static inline bool preempt_model_preemptible(void) { - return preempt_model_full() || preempt_model_rt(); + return preempt_model_full() || preempt_model_lazy() || preempt_model_rt(); } #endif /* __LINUX_PREEMPT_H */ -- cgit v1.2.3 From 18d92bb57c39504d9da11c6ef604f58eb1d5a117 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 22 Oct 2024 18:59:08 +0300 Subject: perf/core: Add aux_pause, aux_resume, aux_start_paused Hardware traces, such as instruction traces, can produce a vast amount of trace data, so being able to reduce tracing to more specific circumstances can be useful. The ability to pause or resume tracing when another event happens, can do that. Add ability for an event to "pause" or "resume" AUX area tracing. Add aux_pause bit to perf_event_attr to indicate that, if the event happens, the associated AUX area tracing should be paused. Ditto aux_resume. Do not allow aux_pause and aux_resume to be set together. Add aux_start_paused bit to perf_event_attr to indicate to an AUX area event that it should start in a "paused" state. Add aux_paused to struct hw_perf_event for AUX area events to keep track of the "paused" state. aux_paused is initialized to aux_start_paused. Add PERF_EF_PAUSE and PERF_EF_RESUME modes for ->stop() and ->start() callbacks. Call as needed, during __perf_event_output(). Add aux_in_pause_resume to struct perf_buffer to prevent races with the NMI handler. Pause/resume in NMI context will miss out if it coincides with another pause/resume. To use aux_pause or aux_resume, an event must be in a group with the AUX area event as the group leader. Example (requires Intel PT and tools patches also): $ perf record --kcore -e intel_pt/aux-action=start-paused/k,syscalls:sys_enter_newuname/aux-action=resume/,syscalls:sys_exit_newuname/aux-action=pause/ uname Linux [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.043 MB perf.data ] $ perf script --call-trace uname 30805 [000] 24001.058782799: name: 0x7ffc9c1865b0 uname 30805 [000] 24001.058784424: psb offs: 0 uname 30805 [000] 24001.058784424: cbr: 39 freq: 3904 MHz (139%) uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) debug_smp_processor_id uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) __x64_sys_newuname uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) down_read uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) __cond_resched uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) preempt_count_add uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) in_lock_functions uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) preempt_count_sub uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) up_read uname 30805 [000] 24001.058784629: ([kernel.kallsyms]) preempt_count_add uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) in_lock_functions uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) preempt_count_sub uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) _copy_to_user uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) syscall_exit_to_user_mode uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) syscall_exit_work uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) perf_syscall_exit uname 30805 [000] 24001.058784838: ([kernel.kallsyms]) debug_smp_processor_id uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_trace_buf_alloc uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_swevent_get_recursion_context uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) debug_smp_processor_id uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) debug_smp_processor_id uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_tp_event uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_trace_buf_update uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) tracing_gen_ctx_irq_test uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_swevent_event uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) __perf_event_account_interrupt uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) __this_cpu_preempt_check uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_event_output_forward uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) perf_event_aux_pause uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) ring_buffer_get uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) __rcu_read_lock uname 30805 [000] 24001.058785046: ([kernel.kallsyms]) __rcu_read_unlock uname 30805 [000] 24001.058785254: ([kernel.kallsyms]) pt_event_stop uname 30805 [000] 24001.058785254: ([kernel.kallsyms]) debug_smp_processor_id uname 30805 [000] 24001.058785254: ([kernel.kallsyms]) debug_smp_processor_id uname 30805 [000] 24001.058785254: ([kernel.kallsyms]) native_write_msr uname 30805 [000] 24001.058785463: ([kernel.kallsyms]) native_write_msr uname 30805 [000] 24001.058785639: 0x0 Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Acked-by: James Clark Link: https://lkml.kernel.org/r/20241022155920.17511-3-adrian.hunter@intel.com --- include/linux/perf_event.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index fb908843f209..91b310052a7c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -170,6 +170,12 @@ struct hw_perf_event { }; struct { /* aux / Intel-PT */ u64 aux_config; + /* + * For AUX area events, aux_paused cannot be a state + * flag because it can be updated asynchronously to + * state. + */ + unsigned int aux_paused; }; struct { /* software */ struct hrtimer hrtimer; @@ -294,6 +300,7 @@ struct perf_event_pmu_context; #define PERF_PMU_CAP_NO_EXCLUDE 0x0040 #define PERF_PMU_CAP_AUX_OUTPUT 0x0080 #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 +#define PERF_PMU_CAP_AUX_PAUSE 0x0200 /** * pmu::scope @@ -384,6 +391,8 @@ struct pmu { #define PERF_EF_START 0x01 /* start the counter when adding */ #define PERF_EF_RELOAD 0x02 /* reload the counter when starting */ #define PERF_EF_UPDATE 0x04 /* update the counter when stopping */ +#define PERF_EF_PAUSE 0x08 /* AUX area event, pause tracing */ +#define PERF_EF_RESUME 0x10 /* AUX area event, resume tracing */ /* * Adds/Removes a counter to/from the PMU, can be done inside a @@ -423,6 +432,18 @@ struct pmu { * * ->start() with PERF_EF_RELOAD will reprogram the counter * value, must be preceded by a ->stop() with PERF_EF_UPDATE. + * + * ->stop() with PERF_EF_PAUSE will stop as simply as possible. Will not + * overlap another ->stop() with PERF_EF_PAUSE nor ->start() with + * PERF_EF_RESUME. + * + * ->start() with PERF_EF_RESUME will start as simply as possible but + * only if the counter is not otherwise stopped. Will not overlap + * another ->start() with PERF_EF_RESUME nor ->stop() with + * PERF_EF_PAUSE. + * + * Notably, PERF_EF_PAUSE/PERF_EF_RESUME *can* be concurrent with other + * ->stop()/->start() invocations, just not itself. */ void (*start) (struct perf_event *event, int flags); void (*stop) (struct perf_event *event, int flags); @@ -1679,6 +1700,13 @@ static inline bool has_aux(struct perf_event *event) return event->pmu->setup_aux; } +static inline bool has_aux_action(struct perf_event *event) +{ + return event->attr.aux_sample_size || + event->attr.aux_pause || + event->attr.aux_resume; +} + static inline bool is_write_backward(struct perf_event *event) { return !!event->attr.write_backward; -- cgit v1.2.3 From 95e2eaf5b91aae6c3a433cd7882733bd806fa3c8 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 4 Nov 2024 09:40:30 +0800 Subject: iommu/vt-d: Remove unused dmar_msi_read dmar_msi_read() has been unused since 2022 in commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture") Remove it. (dmar_msi_write still exists and is used once). Signed-off-by: Dr. David Alan Gilbert Link: https://lore.kernel.org/r/20241022002702.302728-1-linux@treblig.org Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- include/linux/dmar.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 499bb2c63483..692b2b445761 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -292,7 +292,6 @@ static inline void dmar_copy_shared_irte(struct irte *dst, struct irte *src) struct irq_data; extern void dmar_msi_unmask(struct irq_data *data); extern void dmar_msi_mask(struct irq_data *data); -extern void dmar_msi_read(int irq, struct msi_msg *msg); extern void dmar_msi_write(int irq, struct msi_msg *msg); extern int dmar_set_interrupt(struct intel_iommu *iommu); extern irqreturn_t dmar_fault(int irq, void *dev_id); -- cgit v1.2.3 From bebf29b18f34620e25f7e2bd9e4e4d8e34a8977d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 3 Nov 2024 17:03:31 +0000 Subject: sysfs: introduce callback attribute_group::bin_size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Several drivers need to dynamically calculate the size of an binary attribute. Currently this is done by assigning attr->size from the is_bin_visible() callback. This has drawbacks: * It is not documented. * A single attribute can be instantiated multiple times, overwriting the shared size field. * It prevents the structure to be moved to read-only memory. Introduce a new dedicated callback to calculate the size of the attribute. Signed-off-by: Thomas Weißschuh Acked-by: Krzysztof Wilczyński Link: https://lore.kernel.org/r/20241103-sysfs-const-bin_attr-v2-2-71110628844c@weissschuh.net Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index c4e64dc11206..4746cccb9589 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -87,6 +87,11 @@ do { \ * SYSFS_GROUP_VISIBLE() when assigning this callback to * specify separate _group_visible() and _attr_visible() * handlers. + * @bin_size: + * Optional: Function to return the size of a binary attribute + * of the group. Will be called repeatedly for each binary + * attribute in the group. Overwrites the size field embedded + * inside the attribute itself. * @attrs: Pointer to NULL terminated list of attributes. * @bin_attrs: Pointer to NULL terminated list of binary attributes. * Either attrs or bin_attrs or both must be provided. @@ -97,6 +102,9 @@ struct attribute_group { struct attribute *, int); umode_t (*is_bin_visible)(struct kobject *, struct bin_attribute *, int); + size_t (*bin_size)(struct kobject *, + const struct bin_attribute *, + int); struct attribute **attrs; struct bin_attribute **bin_attrs; }; -- cgit v1.2.3 From b626816fdd7f9beb841856ba049396cff46e99aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 3 Nov 2024 17:03:34 +0000 Subject: sysfs: treewide: constify attribute callback of bin_is_visible() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The is_bin_visible() callbacks should not modify the struct bin_attribute passed as argument. Enforce this by marking the argument as const. As there are not many callback implementers perform this change throughout the tree at once. Signed-off-by: Thomas Weißschuh Acked-by: Martin K. Petersen Acked-by: Jason Gunthorpe Acked-by: Ira Weiny Acked-by: Krzysztof Wilczyński Link: https://lore.kernel.org/r/20241103-sysfs-const-bin_attr-v2-5-71110628844c@weissschuh.net Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 4746cccb9589..d1b22d56198b 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -101,7 +101,7 @@ struct attribute_group { umode_t (*is_visible)(struct kobject *, struct attribute *, int); umode_t (*is_bin_visible)(struct kobject *, - struct bin_attribute *, int); + const struct bin_attribute *, int); size_t (*bin_size)(struct kobject *, const struct bin_attribute *, int); @@ -199,22 +199,22 @@ struct attribute_group { * attributes, the group visibility is determined by the function * specified to is_visible() not is_bin_visible() */ -#define DEFINE_SYSFS_BIN_GROUP_VISIBLE(name) \ - static inline umode_t sysfs_group_visible_##name( \ - struct kobject *kobj, struct bin_attribute *attr, int n) \ - { \ - if (n == 0 && !name##_group_visible(kobj)) \ - return SYSFS_GROUP_INVISIBLE; \ - return name##_attr_visible(kobj, attr, n); \ +#define DEFINE_SYSFS_BIN_GROUP_VISIBLE(name) \ + static inline umode_t sysfs_group_visible_##name( \ + struct kobject *kobj, const struct bin_attribute *attr, int n) \ + { \ + if (n == 0 && !name##_group_visible(kobj)) \ + return SYSFS_GROUP_INVISIBLE; \ + return name##_attr_visible(kobj, attr, n); \ } -#define DEFINE_SIMPLE_SYSFS_BIN_GROUP_VISIBLE(name) \ - static inline umode_t sysfs_group_visible_##name( \ - struct kobject *kobj, struct bin_attribute *a, int n) \ - { \ - if (n == 0 && !name##_group_visible(kobj)) \ - return SYSFS_GROUP_INVISIBLE; \ - return a->mode; \ +#define DEFINE_SIMPLE_SYSFS_BIN_GROUP_VISIBLE(name) \ + static inline umode_t sysfs_group_visible_##name( \ + struct kobject *kobj, const struct bin_attribute *a, int n) \ + { \ + if (n == 0 && !name##_group_visible(kobj)) \ + return SYSFS_GROUP_INVISIBLE; \ + return a->mode; \ } #define SYSFS_GROUP_VISIBLE(fn) sysfs_group_visible_##fn -- cgit v1.2.3 From 94a20fb9af16417ab5fd17bcde3d906926f15ef6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 3 Nov 2024 17:03:35 +0000 Subject: sysfs: treewide: constify attribute callback of bin_attribute::mmap() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mmap() callbacks should not modify the struct bin_attribute passed as argument. Enforce this by marking the argument as const. As there are not many callback implementers perform this change throughout the tree at once. Signed-off-by: Thomas Weißschuh Acked-by: Andrew Donnellan # ocxl Acked-by: Krzysztof Wilczyński Link: https://lore.kernel.org/r/20241103-sysfs-const-bin_attr-v2-6-71110628844c@weissschuh.net Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index d1b22d56198b..9fcdc8cd3118 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -309,7 +309,7 @@ struct bin_attribute { char *, loff_t, size_t); loff_t (*llseek)(struct file *, struct kobject *, struct bin_attribute *, loff_t, int); - int (*mmap)(struct file *, struct kobject *, struct bin_attribute *attr, + int (*mmap)(struct file *, struct kobject *, const struct bin_attribute *attr, struct vm_area_struct *vma); }; -- cgit v1.2.3 From 699e7b85afb5d94b99b0a3edca7e9e93ea320c8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 3 Nov 2024 17:03:36 +0000 Subject: sysfs: treewide: constify attribute callback of bin_attribute::llseek() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The llseek() callbacks should not modify the struct bin_attribute passed as argument. Enforce this by marking the argument as const. As there are not many callback implementers perform this change throughout the tree at once. Signed-off-by: Thomas Weißschuh Acked-by: Krzysztof Wilczyński Link: https://lore.kernel.org/r/20241103-sysfs-const-bin_attr-v2-7-71110628844c@weissschuh.net Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 9fcdc8cd3118..cb2a5e277c23 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -307,7 +307,7 @@ struct bin_attribute { char *, loff_t, size_t); ssize_t (*write)(struct file *, struct kobject *, struct bin_attribute *, char *, loff_t, size_t); - loff_t (*llseek)(struct file *, struct kobject *, struct bin_attribute *, + loff_t (*llseek)(struct file *, struct kobject *, const struct bin_attribute *, loff_t, int); int (*mmap)(struct file *, struct kobject *, const struct bin_attribute *attr, struct vm_area_struct *vma); -- cgit v1.2.3 From ae587a509903cca138e910445d8c21fe73b45c80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 3 Nov 2024 17:03:37 +0000 Subject: sysfs: implement all BIN_ATTR_* macros in terms of __BIN_ATTR() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The preparations for the upcoming constification of struct bin_attribute requires some logic in the structure definition macros. To avoid duplication of that logic in multiple macros, reimplement all other macros in terms of __BIN_ATTR(). Signed-off-by: Thomas Weißschuh Acked-by: Krzysztof Wilczyński Link: https://lore.kernel.org/r/20241103-sysfs-const-bin_attr-v2-8-71110628844c@weissschuh.net Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index cb2a5e277c23..d17c473c1ef2 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -333,17 +333,11 @@ struct bin_attribute { .size = _size, \ } -#define __BIN_ATTR_RO(_name, _size) { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .read = _name##_read, \ - .size = _size, \ -} +#define __BIN_ATTR_RO(_name, _size) \ + __BIN_ATTR(_name, 0444, _name##_read, NULL, _size) -#define __BIN_ATTR_WO(_name, _size) { \ - .attr = { .name = __stringify(_name), .mode = 0200 }, \ - .write = _name##_write, \ - .size = _size, \ -} +#define __BIN_ATTR_WO(_name, _size) \ + __BIN_ATTR(_name, 0200, NULL, _name##_write, _size) #define __BIN_ATTR_RW(_name, _size) \ __BIN_ATTR(_name, 0644, _name##_read, _name##_write, _size) @@ -364,11 +358,8 @@ struct bin_attribute bin_attr_##_name = __BIN_ATTR_WO(_name, _size) struct bin_attribute bin_attr_##_name = __BIN_ATTR_RW(_name, _size) -#define __BIN_ATTR_ADMIN_RO(_name, _size) { \ - .attr = { .name = __stringify(_name), .mode = 0400 }, \ - .read = _name##_read, \ - .size = _size, \ -} +#define __BIN_ATTR_ADMIN_RO(_name, _size) \ + __BIN_ATTR(_name, 0400, _name##_read, NULL, _size) #define __BIN_ATTR_ADMIN_RW(_name, _size) \ __BIN_ATTR(_name, 0600, _name##_read, _name##_write, _size) @@ -379,10 +370,8 @@ struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RO(_name, _size) #define BIN_ATTR_ADMIN_RW(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RW(_name, _size) -#define __BIN_ATTR_SIMPLE_RO(_name, _mode) { \ - .attr = { .name = __stringify(_name), .mode = _mode }, \ - .read = sysfs_bin_attr_simple_read, \ -} +#define __BIN_ATTR_SIMPLE_RO(_name, _mode) \ + __BIN_ATTR(_name, _mode, sysfs_bin_attr_simple_read, NULL, 0) #define BIN_ATTR_SIMPLE_RO(_name) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_SIMPLE_RO(_name, 0444) -- cgit v1.2.3 From eb2e6c3a8d66ff37b2ee26cd32334ae0e05fd596 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 3 Nov 2024 17:03:38 +0000 Subject: sysfs: bin_attribute: add const read/write callback variants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To make it possible to put struct bin_attribute into read-only memory, the sysfs core has to stop passing mutable pointers to the read() and write() callbacks. As there are numerous implementors of these callbacks throughout the tree it's not possible to change all of them at once. To enable a step-by-step transition, add new variants of the read() and write() callbacks which differ only in the constness of the struct bin_attribute argument. As most binary attributes are defined through macros, extend these macros to transparently handle both variants of callbacks to minimize the churn during the transition. As soon as all handlers are switch to the const variant, the non-const one can be removed together with the transition machinery. Signed-off-by: Thomas Weißschuh Acked-by: Krzysztof Wilczyński Link: https://lore.kernel.org/r/20241103-sysfs-const-bin_attr-v2-9-71110628844c@weissschuh.net Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index d17c473c1ef2..d713a6445a62 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -305,8 +305,12 @@ struct bin_attribute { struct address_space *(*f_mapping)(void); ssize_t (*read)(struct file *, struct kobject *, struct bin_attribute *, char *, loff_t, size_t); + ssize_t (*read_new)(struct file *, struct kobject *, const struct bin_attribute *, + char *, loff_t, size_t); ssize_t (*write)(struct file *, struct kobject *, struct bin_attribute *, char *, loff_t, size_t); + ssize_t (*write_new)(struct file *, struct kobject *, + const struct bin_attribute *, char *, loff_t, size_t); loff_t (*llseek)(struct file *, struct kobject *, const struct bin_attribute *, loff_t, int); int (*mmap)(struct file *, struct kobject *, const struct bin_attribute *attr, @@ -325,11 +329,28 @@ struct bin_attribute { */ #define sysfs_bin_attr_init(bin_attr) sysfs_attr_init(&(bin_attr)->attr) +typedef ssize_t __sysfs_bin_rw_handler_new(struct file *, struct kobject *, + const struct bin_attribute *, char *, loff_t, size_t); + /* macros to create static binary attributes easier */ #define __BIN_ATTR(_name, _mode, _read, _write, _size) { \ .attr = { .name = __stringify(_name), .mode = _mode }, \ - .read = _read, \ - .write = _write, \ + .read = _Generic(_read, \ + __sysfs_bin_rw_handler_new * : NULL, \ + default : _read \ + ), \ + .read_new = _Generic(_read, \ + __sysfs_bin_rw_handler_new * : _read, \ + default : NULL \ + ), \ + .write = _Generic(_write, \ + __sysfs_bin_rw_handler_new * : NULL, \ + default : _write \ + ), \ + .write_new = _Generic(_write, \ + __sysfs_bin_rw_handler_new * : _write, \ + default : NULL \ + ), \ .size = _size, \ } -- cgit v1.2.3 From 64c58d7c99343a910edf995e15d8037e19ec5777 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 3 Nov 2024 20:19:16 -0800 Subject: iomap: add a merge boundary flag File systems might have boundaries over which merges aren't possible. In fact these are very common, although most of the time some kind of header at the beginning of this region (e.g. XFS alloation groups, ext4 block groups) automatically create a merge barrier. But if that is not present, say for a device purely used for data we need to manually communicate that to iomap. Add a IOMAP_F_BOUNDARY flag to never merge I/O into a previous mapping. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- include/linux/iomap.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index f61407e3b121..9ecb8ea7714c 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -53,6 +53,9 @@ struct vm_fault; * * IOMAP_F_XATTR indicates that the iomap is for an extended attribute extent * rather than a file data extent. + * + * IOMAP_F_BOUNDARY indicates that I/O and I/O completions for this iomap must + * never be merged with the mapping before it. */ #define IOMAP_F_NEW (1U << 0) #define IOMAP_F_DIRTY (1U << 1) @@ -64,6 +67,7 @@ struct vm_fault; #define IOMAP_F_BUFFER_HEAD 0 #endif /* CONFIG_BUFFER_HEAD */ #define IOMAP_F_XATTR (1U << 5) +#define IOMAP_F_BOUNDARY (1U << 6) /* * Flags set by the core iomap code during operations: -- cgit v1.2.3 From 57a063632df8db6cb20d64ee52a06d4e2049235a Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 23 Oct 2024 11:45:24 -0700 Subject: Input: introduce notion of passive observers for input handlers Sometimes it is useful to observe (and maybe modify) data coming from an input device, but only do that if there are other users of such input device. An example is touchpad switching functionality on Lenovo IdeaPad Z570 where it is desirable to suppress events coming from the touchpad if user toggles touchpad on/off button (on this laptop the firmware does not stop the device). Introduce notion of passive observers for input handlers to solve this issue. An input handler marked as passive observer behaves exactly like any other input handler or filter, but with one exception: it does not open/start underlying input device when attaching to it. Link: https://lore.kernel.org/r/ZxlEROX7bMo5cbZP@google.com Signed-off-by: Dmitry Torokhov --- include/linux/input.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/input.h b/include/linux/input.h index 89a0be6ee0e2..6437c35f0796 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -286,6 +286,10 @@ struct input_handle; * @start: starts handler for given handle. This function is called by * input core right after connect() method and also when a process * that "grabbed" a device releases it + * @passive_observer: set to %true by drivers only interested in observing + * data stream from devices if there are other users present. Such + * drivers will not result in starting underlying hardware device + * when input_open_device() is called for their handles * @legacy_minors: set to %true by drivers using legacy minor ranges * @minor: beginning of range of 32 legacy minors for devices this driver * can provide @@ -321,6 +325,7 @@ struct input_handler { void (*disconnect)(struct input_handle *handle); void (*start)(struct input_handle *handle); + bool passive_observer; bool legacy_minors; int minor; const char *name; -- cgit v1.2.3 From d2d243df445a88c26e91eac02b041213c7a32e9e Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Sun, 22 Sep 2024 12:32:13 +0800 Subject: mm: shmem: fix khugepaged activation policy for shmem Shmem has a separate interface (different from anonymous pages) to control huge page allocation, that means shmem THP can be enabled while anonymous THP is disabled. However, in this case, khugepaged will not start to collapse shmem THP, which is unreasonable. To fix this issue, we should call start_stop_khugepaged() to activate or deactivate the khugepaged thread when setting shmem mTHP interfaces. Moreover, add a new helper shmem_hpage_pmd_enabled() to help to check whether shmem THP is enabled, which will determine if khugepaged should be activated. Link: https://lkml.kernel.org/r/9b9c6cbc4499bf44c6455367fd9e0f6036525680.1726978977.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reported-by: Ryan Roberts Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 515a9a6a3c6f..ee6635052383 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -114,6 +114,7 @@ int shmem_unuse(unsigned int type); unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, loff_t write_end, bool shmem_huge_force); +bool shmem_hpage_pmd_enabled(void); #else static inline unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, @@ -121,6 +122,11 @@ static inline unsigned long shmem_allowable_huge_orders(struct inode *inode, { return 0; } + +static inline bool shmem_hpage_pmd_enabled(void) +{ + return false; +} #endif #ifdef CONFIG_SHMEM -- cgit v1.2.3 From 9e9e085effe9b7e342138fde3cf8577d22509932 Mon Sep 17 00:00:00 2001 From: Adrian Huang Date: Sat, 27 Jul 2024 00:52:46 +0800 Subject: mm/vmalloc: combine all TLB flush operations of KASAN shadow virtual address into one operation When compiling kernel source 'make -j $(nproc)' with the up-and-running KASAN-enabled kernel on a 256-core machine, the following soft lockup is shown: watchdog: BUG: soft lockup - CPU#28 stuck for 22s! [kworker/28:1:1760] CPU: 28 PID: 1760 Comm: kworker/28:1 Kdump: loaded Not tainted 6.10.0-rc5 #95 Workqueue: events drain_vmap_area_work RIP: 0010:smp_call_function_many_cond+0x1d8/0xbb0 Code: 38 c8 7c 08 84 c9 0f 85 49 08 00 00 8b 45 08 a8 01 74 2e 48 89 f1 49 89 f7 48 c1 e9 03 41 83 e7 07 4c 01 e9 41 83 c7 03 f3 90 <0f> b6 01 41 38 c7 7c 08 84 c0 0f 85 d4 06 00 00 8b 45 08 a8 01 75 RSP: 0018:ffffc9000cb3fb60 EFLAGS: 00000202 RAX: 0000000000000011 RBX: ffff8883bc4469c0 RCX: ffffed10776e9949 RDX: 0000000000000002 RSI: ffff8883bb74ca48 RDI: ffffffff8434dc50 RBP: ffff8883bb74ca40 R08: ffff888103585dc0 R09: ffff8884533a1800 R10: 0000000000000004 R11: ffffffffffffffff R12: ffffed1077888d39 R13: dffffc0000000000 R14: ffffed1077888d38 R15: 0000000000000003 FS: 0000000000000000(0000) GS:ffff8883bc400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005577b5c8d158 CR3: 0000000004850000 CR4: 0000000000350ef0 Call Trace: ? watchdog_timer_fn+0x2cd/0x390 ? __pfx_watchdog_timer_fn+0x10/0x10 ? __hrtimer_run_queues+0x300/0x6d0 ? sched_clock_cpu+0x69/0x4e0 ? __pfx___hrtimer_run_queues+0x10/0x10 ? srso_return_thunk+0x5/0x5f ? ktime_get_update_offsets_now+0x7f/0x2a0 ? srso_return_thunk+0x5/0x5f ? srso_return_thunk+0x5/0x5f ? hrtimer_interrupt+0x2ca/0x760 ? __sysvec_apic_timer_interrupt+0x8c/0x2b0 ? sysvec_apic_timer_interrupt+0x6a/0x90 ? asm_sysvec_apic_timer_interrupt+0x16/0x20 ? smp_call_function_many_cond+0x1d8/0xbb0 ? __pfx_do_kernel_range_flush+0x10/0x10 on_each_cpu_cond_mask+0x20/0x40 flush_tlb_kernel_range+0x19b/0x250 ? srso_return_thunk+0x5/0x5f ? kasan_release_vmalloc+0xa7/0xc0 purge_vmap_node+0x357/0x820 ? __pfx_purge_vmap_node+0x10/0x10 __purge_vmap_area_lazy+0x5b8/0xa10 drain_vmap_area_work+0x21/0x30 process_one_work+0x661/0x10b0 worker_thread+0x844/0x10e0 ? srso_return_thunk+0x5/0x5f ? __kthread_parkme+0x82/0x140 ? __pfx_worker_thread+0x10/0x10 kthread+0x2a5/0x370 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x30/0x70 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 Debugging Analysis: 1. The following ftrace log shows that the lockup CPU spends too much time iterating vmap_nodes and flushing TLB when purging vm_area structures. (Some info is trimmed). kworker: funcgraph_entry: | drain_vmap_area_work() { kworker: funcgraph_entry: | mutex_lock() { kworker: funcgraph_entry: 1.092 us | __cond_resched(); kworker: funcgraph_exit: 3.306 us | } ... ... kworker: funcgraph_entry: | flush_tlb_kernel_range() { ... ... kworker: funcgraph_exit: # 7533.649 us | } ... ... kworker: funcgraph_entry: 2.344 us | mutex_unlock(); kworker: funcgraph_exit: $ 23871554 us | } The drain_vmap_area_work() spends over 23 seconds. There are 2805 flush_tlb_kernel_range() calls in the ftrace log. * One is called in __purge_vmap_area_lazy(). * Others are called by purge_vmap_node->kasan_release_vmalloc. purge_vmap_node() iteratively releases kasan vmalloc allocations and flushes TLB for each vmap_area. - [Rough calculation] Each flush_tlb_kernel_range() runs about 7.5ms. -- 2804 * 7.5ms = 21.03 seconds. -- That's why a soft lock is triggered. 2. Extending the soft lockup time can work around the issue (For example, # echo 60 > /proc/sys/kernel/watchdog_thresh). This confirms the above-mentioned speculation: drain_vmap_area_work() spends too much time. If we combine all TLB flush operations of the KASAN shadow virtual address into one operation in the call path 'purge_vmap_node()->kasan_release_vmalloc()', the running time of drain_vmap_area_work() can be saved greatly. The idea is from the flush_tlb_kernel_range() call in __purge_vmap_area_lazy(). And, the soft lockup won't be triggered. Here is the test result based on 6.10: [6.10 wo/ the patch] 1. ftrace latency profiling (record a trace if the latency > 20s). echo 20000000 > /sys/kernel/debug/tracing/tracing_thresh echo drain_vmap_area_work > /sys/kernel/debug/tracing/set_graph_function echo function_graph > /sys/kernel/debug/tracing/current_tracer echo 1 > /sys/kernel/debug/tracing/tracing_on 2. Run `make -j $(nproc)` to compile the kernel source 3. Once the soft lockup is reproduced, check the ftrace log: cat /sys/kernel/debug/tracing/trace # tracer: function_graph # # CPU DURATION FUNCTION CALLS # | | | | | | | 76) $ 50412985 us | } /* __purge_vmap_area_lazy */ 76) $ 50412997 us | } /* drain_vmap_area_work */ 76) $ 29165911 us | } /* __purge_vmap_area_lazy */ 76) $ 29165926 us | } /* drain_vmap_area_work */ 91) $ 53629423 us | } /* __purge_vmap_area_lazy */ 91) $ 53629434 us | } /* drain_vmap_area_work */ 91) $ 28121014 us | } /* __purge_vmap_area_lazy */ 91) $ 28121026 us | } /* drain_vmap_area_work */ [6.10 w/ the patch] 1. Repeat step 1-2 in "[6.10 wo/ the patch]" 2. The soft lockup is not triggered and ftrace log is empty. cat /sys/kernel/debug/tracing/trace # tracer: function_graph # # CPU DURATION FUNCTION CALLS # | | | | | | | 3. Setting 'tracing_thresh' to 10/5 seconds does not get any ftrace log. 4. Setting 'tracing_thresh' to 1 second gets ftrace log. cat /sys/kernel/debug/tracing/trace # tracer: function_graph # # CPU DURATION FUNCTION CALLS # | | | | | | | 23) $ 1074942 us | } /* __purge_vmap_area_lazy */ 23) $ 1074950 us | } /* drain_vmap_area_work */ The worst execution time of drain_vmap_area_work() is about 1 second. Link: https://lore.kernel.org/lkml/ZqFlawuVnOMY2k3E@pc638.lan/ Link: https://lkml.kernel.org/r/20240726165246.31326-1-ahuang12@lenovo.com Fixes: 282631cb2447 ("mm: vmalloc: remove global purge_vmap_area_root rb-tree") Signed-off-by: Adrian Huang Co-developed-by: Uladzislau Rezki (Sony) Signed-off-by: Uladzislau Rezki (Sony) Tested-by: Jiwei Sun Reviewed-by: Baoquan He Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Christoph Hellwig Cc: Dmitry Vyukov Cc: Vincenzo Frascino Cc: Signed-off-by: Andrew Morton --- include/linux/kasan.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 00a3bf7c0d8f..6bbfc8aa42e8 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -29,6 +29,9 @@ typedef unsigned int __bitwise kasan_vmalloc_flags_t; #define KASAN_VMALLOC_VM_ALLOC ((__force kasan_vmalloc_flags_t)0x02u) #define KASAN_VMALLOC_PROT_NORMAL ((__force kasan_vmalloc_flags_t)0x04u) +#define KASAN_VMALLOC_PAGE_RANGE 0x1 /* Apply exsiting page range */ +#define KASAN_VMALLOC_TLB_FLUSH 0x2 /* TLB flush */ + #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) #include @@ -564,7 +567,8 @@ void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); int kasan_populate_vmalloc(unsigned long addr, unsigned long size); void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, - unsigned long free_region_end); + unsigned long free_region_end, + unsigned long flags); #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ @@ -579,7 +583,8 @@ static inline int kasan_populate_vmalloc(unsigned long start, static inline void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, - unsigned long free_region_end) { } + unsigned long free_region_end, + unsigned long flags) { } #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ @@ -614,7 +619,8 @@ static inline int kasan_populate_vmalloc(unsigned long start, static inline void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, - unsigned long free_region_end) { } + unsigned long free_region_end, + unsigned long flags) { } static inline void *kasan_unpoison_vmalloc(const void *start, unsigned long size, -- cgit v1.2.3 From 1cd1a4e71b61eaf8cadd15372b67ccd60a2e1a99 Mon Sep 17 00:00:00 2001 From: Tanya Agarwal Date: Fri, 27 Sep 2024 00:05:16 +0530 Subject: mm/mempolicy: fix comments for better documentation Fix typo in mempolicy.h and Correct the number of allowed memory policy Link: https://lkml.kernel.org/r/20240926183516.4034-2-tanyaagarwal25699@gmail.com Signed-off-by: Tanya Agarwal Reviewed-by: Shuah Khan Cc: Anup Sharma Signed-off-by: Andrew Morton --- include/linux/mempolicy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 1add16f21612..ce9885e0178a 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -47,7 +47,7 @@ struct mempolicy { atomic_t refcnt; unsigned short mode; /* See MPOL_* above */ unsigned short flags; /* See set_mempolicy() MPOL_F_* above */ - nodemask_t nodes; /* interleave/bind/perfer */ + nodemask_t nodes; /* interleave/bind/preferred/etc */ int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */ union { -- cgit v1.2.3 From f2f484085ef1a2bb5aea861a06bc6b4dc50d2ab8 Mon Sep 17 00:00:00 2001 From: Nanyong Sun Date: Thu, 26 Sep 2024 15:49:22 +0800 Subject: mm: move mm flags to mm_types.h The types of mm flags are now far beyond the core dump related features. This patch moves mm flags from linux/sched/coredump.h to linux/mm_types.h. The linux/sched/coredump.h has include the mm_types.h, so the C files related to coredump does not need to change head file inclusion. In addition, the inclusion of sched/coredump.h now can be deleted from the C files that irrelevant to core dump. Link: https://lkml.kernel.org/r/20240926074922.2721274-1-sunnanyong@huawei.com Signed-off-by: Nanyong Sun Cc: Kefeng Wang Cc: Masami Hiramatsu Cc: Matthew Wilcox Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 1 - include/linux/khugepaged.h | 2 - include/linux/ksm.h | 1 - include/linux/mm_types.h | 84 ++++++++++++++++++++++++++++++++++++++++++ include/linux/oom.h | 1 - include/linux/sched/coredump.h | 82 ----------------------------------------- 6 files changed, 84 insertions(+), 87 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ef5b80e48599..8afe09a2cf03 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -2,7 +2,6 @@ #ifndef _LINUX_HUGE_MM_H #define _LINUX_HUGE_MM_H -#include #include #include /* only for vma_is_dax() */ diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 30baae91b225..1f46046080f5 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -2,8 +2,6 @@ #ifndef _LINUX_KHUGEPAGED_H #define _LINUX_KHUGEPAGED_H -#include /* MMF_VM_HUGEPAGE */ - extern unsigned int khugepaged_max_ptes_none __read_mostly; #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern struct attribute_group khugepaged_attr_group; diff --git a/include/linux/ksm.h b/include/linux/ksm.h index ec9c05044d4f..29022e71a074 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -13,7 +13,6 @@ #include #include #include -#include #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6e3bdf8e38bc..ff8627acbaa7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1499,4 +1499,88 @@ enum { /* See also internal only FOLL flags in mm/internal.h */ }; +/* mm flags */ + +/* + * The first two bits represent core dump modes for set-user-ID, + * the modes are SUID_DUMP_* defined in linux/sched/coredump.h + */ +#define MMF_DUMPABLE_BITS 2 +#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) +/* coredump filter bits */ +#define MMF_DUMP_ANON_PRIVATE 2 +#define MMF_DUMP_ANON_SHARED 3 +#define MMF_DUMP_MAPPED_PRIVATE 4 +#define MMF_DUMP_MAPPED_SHARED 5 +#define MMF_DUMP_ELF_HEADERS 6 +#define MMF_DUMP_HUGETLB_PRIVATE 7 +#define MMF_DUMP_HUGETLB_SHARED 8 +#define MMF_DUMP_DAX_PRIVATE 9 +#define MMF_DUMP_DAX_SHARED 10 + +#define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS +#define MMF_DUMP_FILTER_BITS 9 +#define MMF_DUMP_FILTER_MASK \ + (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) +#define MMF_DUMP_FILTER_DEFAULT \ + ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\ + (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) + +#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS +# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS) +#else +# define MMF_DUMP_MASK_DEFAULT_ELF 0 +#endif + /* leave room for more dump flags */ +#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ +#define MMF_VM_HUGEPAGE 17 /* set when mm is available for khugepaged */ + +/* + * This one-shot flag is dropped due to necessity of changing exe once again + * on NFS restore + */ +//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ + +#define MMF_HAS_UPROBES 19 /* has uprobes */ +#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ +#define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ +#define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ +#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ +#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ +#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) +#define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ +#define MMF_MULTIPROCESS 26 /* mm is shared between processes */ +/* + * MMF_HAS_PINNED: Whether this mm has pinned any pages. This can be either + * replaced in the future by mm.pinned_vm when it becomes stable, or grow into + * a counter on its own. We're aggresive on this bit for now: even if the + * pinned pages were unpinned later on, we'll still keep this bit set for the + * lifecycle of this mm, just for simplicity. + */ +#define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */ + +#define MMF_HAS_MDWE 28 +#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE) + + +#define MMF_HAS_MDWE_NO_INHERIT 29 + +#define MMF_VM_MERGE_ANY 30 +#define MMF_VM_MERGE_ANY_MASK (1 << MMF_VM_MERGE_ANY) + +#define MMF_TOPDOWN 31 /* mm searches top down by default */ +#define MMF_TOPDOWN_MASK (1 << MMF_TOPDOWN) + +#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ + MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\ + MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK) + +static inline unsigned long mmf_init_flags(unsigned long flags) +{ + if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT)) + flags &= ~((1UL << MMF_HAS_MDWE) | + (1UL << MMF_HAS_MDWE_NO_INHERIT)); + return flags & MMF_INIT_MASK; +} + #endif /* _LINUX_MM_TYPES_H */ diff --git a/include/linux/oom.h b/include/linux/oom.h index 7d0c9c48a0c5..1e0fc6931ce9 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -7,7 +7,6 @@ #include #include #include -#include /* MMF_* */ #include /* VM_FAULT* */ struct zonelist; diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index e62ff805cfc9..6eb65ceed213 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -8,12 +8,6 @@ #define SUID_DUMP_USER 1 /* Dump as user of process */ #define SUID_DUMP_ROOT 2 /* Dump as root */ -/* mm flags */ - -/* for SUID_DUMP_* above */ -#define MMF_DUMPABLE_BITS 2 -#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) - extern void set_dumpable(struct mm_struct *mm, int value); /* * This returns the actual value of the suid_dumpable flag. For things @@ -31,80 +25,4 @@ static inline int get_dumpable(struct mm_struct *mm) return __get_dumpable(mm->flags); } -/* coredump filter bits */ -#define MMF_DUMP_ANON_PRIVATE 2 -#define MMF_DUMP_ANON_SHARED 3 -#define MMF_DUMP_MAPPED_PRIVATE 4 -#define MMF_DUMP_MAPPED_SHARED 5 -#define MMF_DUMP_ELF_HEADERS 6 -#define MMF_DUMP_HUGETLB_PRIVATE 7 -#define MMF_DUMP_HUGETLB_SHARED 8 -#define MMF_DUMP_DAX_PRIVATE 9 -#define MMF_DUMP_DAX_SHARED 10 - -#define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS -#define MMF_DUMP_FILTER_BITS 9 -#define MMF_DUMP_FILTER_MASK \ - (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) -#define MMF_DUMP_FILTER_DEFAULT \ - ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\ - (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) - -#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS -# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS) -#else -# define MMF_DUMP_MASK_DEFAULT_ELF 0 -#endif - /* leave room for more dump flags */ -#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ -#define MMF_VM_HUGEPAGE 17 /* set when mm is available for - khugepaged */ -/* - * This one-shot flag is dropped due to necessity of changing exe once again - * on NFS restore - */ -//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ - -#define MMF_HAS_UPROBES 19 /* has uprobes */ -#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ -#define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ -#define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ -#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ -#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ -#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) -#define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ -#define MMF_MULTIPROCESS 26 /* mm is shared between processes */ -/* - * MMF_HAS_PINNED: Whether this mm has pinned any pages. This can be either - * replaced in the future by mm.pinned_vm when it becomes stable, or grow into - * a counter on its own. We're aggresive on this bit for now: even if the - * pinned pages were unpinned later on, we'll still keep this bit set for the - * lifecycle of this mm, just for simplicity. - */ -#define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */ - -#define MMF_HAS_MDWE 28 -#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE) - - -#define MMF_HAS_MDWE_NO_INHERIT 29 - -#define MMF_VM_MERGE_ANY 30 -#define MMF_VM_MERGE_ANY_MASK (1 << MMF_VM_MERGE_ANY) - -#define MMF_TOPDOWN 31 /* mm searches top down by default */ -#define MMF_TOPDOWN_MASK (1 << MMF_TOPDOWN) - -#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ - MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\ - MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK) - -static inline unsigned long mmf_init_flags(unsigned long flags) -{ - if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT)) - flags &= ~((1UL << MMF_HAS_MDWE) | - (1UL << MMF_HAS_MDWE_NO_INHERIT)); - return flags & MMF_INIT_MASK; -} - #endif /* _LINUX_SCHED_COREDUMP_H */ -- cgit v1.2.3 From 66efef9b1a7d6cc725efa9395fb390483ad5b555 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:14 +0800 Subject: mm: pgtable: introduce pte_offset_map_{ro|rw}_nolock() Patch series "introduce pte_offset_map_{ro|rw}_nolock()", v5. As proposed by David Hildenbrand [1], this series introduces the following two new helper functions to replace pte_offset_map_nolock(). 1. pte_offset_map_ro_nolock() 2. pte_offset_map_rw_nolock() As the name suggests, pte_offset_map_ro_nolock() is used for read-only case. In this case, only read-only operations will be performed on PTE page after the PTL is held. The RCU lock in pte_offset_map_nolock() will ensure that the PTE page will not be freed, and there is no need to worry about whether the pmd entry is modified. Therefore pte_offset_map_ro_nolock() is just a renamed version of pte_offset_map_nolock(). pte_offset_map_rw_nolock() is used for may-write case. In this case, the pte or pmd entry may be modified after the PTL is held, so we need to ensure that the pmd entry has not been modified concurrently. So in addition to the name change, it also outputs the pmdval when successful. The users should make sure the page table is stable like checking pte_same() or checking pmd_same() by using the output pmdval before performing the write operations. This series will convert all pte_offset_map_nolock() into the above two helper functions one by one, and finally completely delete it. This also a preparation for reclaiming the empty user PTE page table pages. This patch (of 13): Currently, the usage of pte_offset_map_nolock() can be divided into the following two cases: 1) After acquiring PTL, only read-only operations are performed on the PTE page. In this case, the RCU lock in pte_offset_map_nolock() will ensure that the PTE page will not be freed, and there is no need to worry about whether the pmd entry is modified. 2) After acquiring PTL, the pte or pmd entries may be modified. At this time, we need to ensure that the pmd entry has not been modified concurrently. To more clearing distinguish between these two cases, this commit introduces two new helper functions to replace pte_offset_map_nolock(). For 1), just rename it to pte_offset_map_ro_nolock(). For 2), in addition to changing the name to pte_offset_map_rw_nolock(), it also outputs the pmdval when successful. It is applicable for may-write cases where any modification operations to the page table may happen after the corresponding spinlock is held afterwards. But the users should make sure the page table is stable like checking pte_same() or checking pmd_same() by using the output pmdval before performing the write operations. Note: "RO" / "RW" expresses the intended semantics, not that the *kmap* will be read-only/read-write protected. Subsequent commits will convert pte_offset_map_nolock() into the above two functions one by one, and finally completely delete it. Link: https://lkml.kernel.org/r/cover.1727332572.git.zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/5aeecfa131600a454b1f3a038a1a54282ca3b856.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 61fff5d34ed5..0cf45d4b7286 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3017,6 +3017,11 @@ static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp); +pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, spinlock_t **ptlp); +pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, pmd_t *pmdvalp, + spinlock_t **ptlp); #define pte_unmap_unlock(pte, ptl) do { \ spin_unlock(ptl); \ -- cgit v1.2.3 From 583e66debd1d5aa8c401aebe924c7406e15579a7 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:26 +0800 Subject: mm: pgtable: remove pte_offset_map_nolock() Now no users are using the pte_offset_map_nolock(), remove it. Link: https://lkml.kernel.org/r/d04f9bbbcde048fb6ffa6f2bdbc6f9b22d5286f9.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0cf45d4b7286..8f5394d75ce2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3015,8 +3015,6 @@ static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, return pte; } -pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr, spinlock_t **ptlp); pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp); pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd, -- cgit v1.2.3 From 473c371254d2c9906c286c939eaa99d0fac13e38 Mon Sep 17 00:00:00 2001 From: Zhaoyang Huang Date: Thu, 26 Sep 2024 13:06:47 +0800 Subject: mm: migrate LRU_REFS_MASK bits in folio_migrate_flags Bits of LRU_REFS_MASK are not inherited during migration which lead to new folio start from tier0 when MGLRU enabled. Try to bring as much bits of folio->flags as possible since compaction and alloc_contig_range which introduce migration do happen at times. Link: https://lkml.kernel.org/r/20240926050647.5653-1-zhaoyang.huang@unisoc.com Signed-off-by: Zhaoyang Huang Suggested-by: Yu Zhao Acked-by: David Hildenbrand Cc: Matthew Wilcox Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index f4fe593c1400..6f801c7b36e2 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -291,6 +291,12 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, return true; } +static inline void folio_migrate_refs(struct folio *new, struct folio *old) +{ + unsigned long refs = READ_ONCE(old->flags) & LRU_REFS_MASK; + + set_mask_bits(&new->flags, LRU_REFS_MASK, refs); +} #else /* !CONFIG_LRU_GEN */ static inline bool lru_gen_enabled(void) @@ -313,6 +319,10 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, return false; } +static inline void folio_migrate_refs(struct folio *new, struct folio *old) +{ + +} #endif /* CONFIG_LRU_GEN */ static __always_inline -- cgit v1.2.3 From 4cc0473d7754d387680bdf0728eb29f0ec8834bf Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 7 Oct 2024 22:49:05 +0800 Subject: get rid of __get_task_comm() Patch series "Improve the copy of task comm", v8. Using {memcpy,strncpy,strcpy,kstrdup} to copy the task comm relies on the length of task comm. Changes in the task comm could result in a destination string that is overflow. Therefore, we should explicitly ensure the destination string is always NUL-terminated, regardless of the task comm. This approach will facilitate future extensions to the task comm. As suggested by Linus [0], we can identify all relevant code with the following git grep command: git grep 'memcpy.*->comm\>' git grep 'kstrdup.*->comm\>' git grep 'strncpy.*->comm\>' git grep 'strcpy.*->comm\>' PATCH #2~#4: memcpy PATCH #5~#6: kstrdup PATCH #7: strcpy Please note that strncpy() is not included in this series as it is being tracked by another effort. [1] This patch (of 7): We want to eliminate the use of __get_task_comm() for the following reasons: - The task_lock() is unnecessary Quoted from Linus [0]: : Since user space can randomly change their names anyway, using locking : was always wrong for readers (for writers it probably does make sense : to have some lock - although practically speaking nobody cares there : either, but at least for a writer some kind of race could have : long-term mixed results Link: https://lkml.kernel.org/r/20241007144911.27693-1-laoar.shao@gmail.com Link: https://lkml.kernel.org/r/20241007144911.27693-2-laoar.shao@gmail.com Link: https://lore.kernel.org/all/CAHk-=wivfrF0_zvf+oj6==Sh=-npJooP8chLPEfaFV0oNYTTBA@mail.gmail.com [0] Link: https://lore.kernel.org/all/CAHk-=whWtUC-AjmGJveAETKOMeMFSTwKwu99v7+b6AyHMmaDFA@mail.gmail.com/ Link: https://lore.kernel.org/all/CAHk-=wjAmmHUg6vho1KjzQi2=psR30+CogFd4aXrThr2gsiS4g@mail.gmail.com/ [0] Link: https://github.com/KSPP/linux/issues/90 [1] Signed-off-by: Yafang Shao Suggested-by: Linus Torvalds Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: Eric Biederman Cc: Kees Cook Cc: Alexei Starovoitov Cc: Matus Jokay Cc: Alejandro Colomar Cc: "Serge E. Hallyn" Cc: Catalin Marinas Cc: Justin Stitt Cc: Steven Rostedt (Google) Cc: Tetsuo Handa Cc: Andy Shevchenko Cc: Daniel Vetter Cc: David Airlie Cc: Eric Paris Cc: James Morris Cc: Maarten Lankhorst Cc: Matthew Wilcox Cc: Maxime Ripard Cc: Ondrej Mosnacek Cc: Paul Moore Cc: Quentin Monnet Cc: Simon Horman Cc: Stephen Smalley Cc: Thomas Zimmermann Signed-off-by: Andrew Morton --- include/linux/sched.h | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index bb343136ddd0..67718d5591dd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1121,9 +1121,12 @@ struct task_struct { /* * executable name, excluding path. * - * - normally initialized setup_new_exec() - * - access it with [gs]et_task_comm() - * - lock it with task_lock() + * - normally initialized begin_new_exec() + * - set it with set_task_comm() + * - strscpy_pad() to ensure it is always NUL-terminated and + * zero-padded + * - task_lock() to ensure the operation is atomic and the name is + * fully updated. */ char comm[TASK_COMM_LEN]; @@ -1938,10 +1941,23 @@ static inline void set_task_comm(struct task_struct *tsk, const char *from) __set_task_comm(tsk, from, false); } -extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); +/* + * - Why not use task_lock()? + * User space can randomly change their names anyway, so locking for readers + * doesn't make sense. For writers, locking is probably necessary, as a race + * condition could lead to long-term mixed results. + * The strscpy_pad() in __set_task_comm() can ensure that the task comm is + * always NUL-terminated and zero-padded. Therefore the race condition between + * reader and writer is not an issue. + * + * - BUILD_BUG_ON() can help prevent the buf from being truncated. + * Since the callers don't perform any return value checks, this safeguard is + * necessary. + */ #define get_task_comm(buf, tsk) ({ \ - BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN); \ - __get_task_comm(buf, sizeof(buf), tsk); \ + BUILD_BUG_ON(sizeof(buf) < TASK_COMM_LEN); \ + strscpy_pad(buf, (tsk)->comm); \ + buf; \ }) #ifdef CONFIG_SMP -- cgit v1.2.3 From f2fa0fd4e7db8326a77618962714924b64f5f889 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 12 Oct 2024 19:52:53 +0200 Subject: reboot: move reboot_notifier_list to kernel/reboot.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All the functions related to the reboot notifier list are in kernel/reboot.c. Move the list itself, too. As there are no direct users anymore, make the declaration static. Link: https://lkml.kernel.org/r/20241012-reboot_notifier_list-v1-1-6093bb9455ce@weissschuh.net Signed-off-by: Thomas Weißschuh Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton --- include/linux/notifier.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 45702bdcbceb..b42e64734968 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -237,7 +237,5 @@ static inline int notifier_to_errno(int ret) #define KBD_KEYSYM 0x0004 /* Keyboard keysym */ #define KBD_POST_KEYSYM 0x0005 /* Called after keyboard keysym interpretation */ -extern struct blocking_notifier_head reboot_notifier_list; - #endif /* __KERNEL__ */ #endif /* _LINUX_NOTIFIER_H */ -- cgit v1.2.3 From a9d38bcd7337f051912174ebfc500e1cef73982e Mon Sep 17 00:00:00 2001 From: Sui Jingfeng Date: Sat, 12 Oct 2024 18:08:17 +0800 Subject: scatterlist: fix a typo Replace the 'One' with 'On'. Link: https://lkml.kernel.org/r/20241012100817.323007-1-sui.jingfeng@linux.dev Fixes: af2880ec4402 ("scatterlist: add dedicated config for DMA flags") Signed-off-by: Sui Jingfeng Reviewed-by: Petr Tesarik Cc: Catalin Marinas Cc: Michael Kelley Cc: Robin Murphy Signed-off-by: Andrew Morton --- include/linux/scatterlist.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index e61d164622db..c5e2239b550e 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -273,7 +273,7 @@ static inline void sg_unmark_end(struct scatterlist *sg) } /* - * One 64-bit architectures there is a 4-byte padding in struct scatterlist + * On 64-bit architectures there is a 4-byte padding in struct scatterlist * (assuming also CONFIG_NEED_SG_DMA_LENGTH is set). Use this padding for DMA * flags bits to indicate when a specific dma address is a bus address or the * buffer may have been bounced via SWIOTLB. -- cgit v1.2.3 From 74ef070e325465a1b364db6a5c6859785537f835 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 21 Oct 2024 10:07:36 +0200 Subject: percpu: merge VERIFY_PERCPU_PTR() into its only user Merge VERIFY_PERCPU_PTR() into non-CONFIG_SMP per_cpu_ptr() to make macro similar to CONFIG_SMP per_cpu_ptr(). This will allow a follow-up patch to refactor common code to a macro. No functional changes, non-CONFIG_SMP per_cpu_ptr() was the only user of VERIFY_PERCPU_PTR(). Link: https://lkml.kernel.org/r/20241021080856.48746-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Acked-by: Christoph Lameter Cc: Dennis Zhou Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/percpu-defs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 8efce7414fad..7fa88c5f4b26 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -254,13 +254,13 @@ do { \ #else /* CONFIG_SMP */ -#define VERIFY_PERCPU_PTR(__p) \ +#define per_cpu_ptr(ptr, cpu) \ ({ \ - __verify_pcpu_ptr(__p); \ - (typeof(*(__p)) __kernel __force *)(__p); \ + (void)(cpu); \ + __verify_pcpu_ptr(ptr); \ + (typeof(*(ptr)) __kernel __force *)(ptr); \ }) -#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); VERIFY_PERCPU_PTR(ptr); }) #define raw_cpu_ptr(ptr) per_cpu_ptr(ptr, 0) #define this_cpu_ptr(ptr) raw_cpu_ptr(ptr) -- cgit v1.2.3 From 001217defda86d0d6a5a9e6cf77a6b813857e7e3 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 21 Oct 2024 10:07:37 +0200 Subject: percpu: introduce PERCPU_PTR() macro Introduce PERCPU_PTR() macro to cast the percpu pointer from the percpu address space to a generic (kernel) address space. Use it in per_cpu_ptr() and related SHIFT_PERCPU_PTR() macros. Also remove common knowledge from SHIFT_PERCPU_PTR() comment, "weird cast" is just a standard way to inform sparse of a cast from the percpu address space to a generic address space. Link: https://lkml.kernel.org/r/20241021080856.48746-2-ubizjak@gmail.com Signed-off-by: Uros Bizjak Acked-by: Christoph Lameter Cc: Dennis Zhou Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/percpu-defs.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 7fa88c5f4b26..e1cf7982424f 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -220,15 +220,17 @@ do { \ (void)__vpp_verify; \ } while (0) +#define PERCPU_PTR(__p) \ + (typeof(*(__p)) __force __kernel *)(__p); + #ifdef CONFIG_SMP /* - * Add an offset to a pointer but keep the pointer as-is. Use RELOC_HIDE() - * to prevent the compiler from making incorrect assumptions about the - * pointer value. The weird cast keeps both GCC and sparse happy. + * Add an offset to a pointer. Use RELOC_HIDE() to prevent the compiler + * from making incorrect assumptions about the pointer value. */ #define SHIFT_PERCPU_PTR(__p, __offset) \ - RELOC_HIDE((typeof(*(__p)) __kernel __force *)(__p), (__offset)) + RELOC_HIDE(PERCPU_PTR(__p), (__offset)) #define per_cpu_ptr(ptr, cpu) \ ({ \ @@ -258,7 +260,7 @@ do { \ ({ \ (void)(cpu); \ __verify_pcpu_ptr(ptr); \ - (typeof(*(ptr)) __kernel __force *)(ptr); \ + PERCPU_PTR(ptr); \ }) #define raw_cpu_ptr(ptr) per_cpu_ptr(ptr, 0) -- cgit v1.2.3 From dabddd687c9e1a06241d6b4d1f66b9f2b60b3ad1 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 21 Oct 2024 10:07:38 +0200 Subject: percpu: cast percpu pointer in PERCPU_PTR() via unsigned long Cast pointer from percpu address space to generic (kernel) address space in PERCPU_PTR() macro via unsigned long intermediate cast [1]. This intermediate cast is also required to avoid build failure when GCC's strict named address space checks for x86 targets [2] are enabled. Found by GCC's named address space checks. [1] https://sparse.docs.kernel.org/en/latest/annotations.html#address-space-name [2] https://gcc.gnu.org/onlinedocs/gcc/Named-Address-Spaces.html#x86-Named-Address-Spaces Link: https://lkml.kernel.org/r/20241021080856.48746-3-ubizjak@gmail.com Signed-off-by: Uros Bizjak Acked-by: Christoph Lameter Cc: Dennis Zhou Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/percpu-defs.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index e1cf7982424f..35842d1e3879 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -221,7 +221,10 @@ do { \ } while (0) #define PERCPU_PTR(__p) \ - (typeof(*(__p)) __force __kernel *)(__p); +({ \ + unsigned long __pcpu_ptr = (__force unsigned long)(__p); \ + (typeof(*(__p)) __force __kernel *)(__pcpu_ptr); \ +}) #ifdef CONFIG_SMP -- cgit v1.2.3 From 92a8b224b833e82d286d2100432adbac8cf8a2a1 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sun, 20 Oct 2024 12:01:51 +0800 Subject: lib/min_heap: introduce non-inline versions of min heap API functions Patch series "Enhance min heap API with non-inline functions and optimizations", v2. Add non-inline versions of the min heap API functions in lib/min_heap.c and updates all users outside of kernel/events/core.c to use these non-inline versions. To mitigate the performance impact of indirect function calls caused by the non-inline versions of the swap and compare functions, a builtin swap has been introduced that swaps elements based on their size. Additionally, it micro-optimizes the efficiency of the min heap by pre-scaling the counter, following the same approach as in lib/sort.c. Documentation for the min heap API has also been added to the core-api section. This patch (of 10): All current min heap API functions are marked with '__always_inline'. However, as the number of users increases, inlining these functions everywhere leads to a increase in kernel size. In performance-critical paths, such as when perf events are enabled and min heap functions are called on every context switch, it is important to retain the inline versions for optimal performance. To balance this, the original inline functions are kept, and additional non-inline versions of the functions have been added in lib/min_heap.c. Link: https://lkml.kernel.org/r/20241020040200.939973-1-visitorckw@gmail.com Link: https://lore.kernel.org/20240522161048.8d8bbc7b153b4ecd92c50666@linux-foundation.org Link: https://lkml.kernel.org/r/20241020040200.939973-2-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Suggested-by: Andrew Morton Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Ching-Chun (Jim) Huang Cc: Coly Li Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Jonathan Corbet Cc: Kent Overstreet Cc: Kuan-Wei Chiu Cc: "Liang, Kan" Cc: Mark Rutland Cc: Matthew Sakai Cc: Matthew Wilcox (Oracle) Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 129 +++++++++++++++++++++++++++++++---------------- 1 file changed, 86 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index 43a7b9dcf15e..0abb21173979 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -40,7 +40,7 @@ struct min_heap_callbacks { /* Initialize a min-heap. */ static __always_inline -void __min_heap_init(min_heap_char *heap, void *data, int size) +void __min_heap_init_inline(min_heap_char *heap, void *data, int size) { heap->nr = 0; heap->size = size; @@ -50,33 +50,33 @@ void __min_heap_init(min_heap_char *heap, void *data, int size) heap->data = heap->preallocated; } -#define min_heap_init(_heap, _data, _size) \ - __min_heap_init((min_heap_char *)_heap, _data, _size) +#define min_heap_init_inline(_heap, _data, _size) \ + __min_heap_init_inline((min_heap_char *)_heap, _data, _size) /* Get the minimum element from the heap. */ static __always_inline -void *__min_heap_peek(struct min_heap_char *heap) +void *__min_heap_peek_inline(struct min_heap_char *heap) { return heap->nr ? heap->data : NULL; } -#define min_heap_peek(_heap) \ - (__minheap_cast(_heap) __min_heap_peek((min_heap_char *)_heap)) +#define min_heap_peek_inline(_heap) \ + (__minheap_cast(_heap) __min_heap_peek_inline((min_heap_char *)_heap)) /* Check if the heap is full. */ static __always_inline -bool __min_heap_full(min_heap_char *heap) +bool __min_heap_full_inline(min_heap_char *heap) { return heap->nr == heap->size; } -#define min_heap_full(_heap) \ - __min_heap_full((min_heap_char *)_heap) +#define min_heap_full_inline(_heap) \ + __min_heap_full_inline((min_heap_char *)_heap) /* Sift the element at pos down the heap. */ static __always_inline -void __min_heap_sift_down(min_heap_char *heap, int pos, size_t elem_size, - const struct min_heap_callbacks *func, void *args) +void __min_heap_sift_down_inline(min_heap_char *heap, int pos, size_t elem_size, + const struct min_heap_callbacks *func, void *args) { void *left, *right; void *data = heap->data; @@ -108,13 +108,14 @@ void __min_heap_sift_down(min_heap_char *heap, int pos, size_t elem_size, } } -#define min_heap_sift_down(_heap, _pos, _func, _args) \ - __min_heap_sift_down((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), _func, _args) +#define min_heap_sift_down_inline(_heap, _pos, _func, _args) \ + __min_heap_sift_down_inline((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), \ + _func, _args) /* Sift up ith element from the heap, O(log2(nr)). */ static __always_inline -void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx, - const struct min_heap_callbacks *func, void *args) +void __min_heap_sift_up_inline(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args) { void *data = heap->data; size_t parent; @@ -128,27 +129,28 @@ void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx, } } -#define min_heap_sift_up(_heap, _idx, _func, _args) \ - __min_heap_sift_up((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args) +#define min_heap_sift_up_inline(_heap, _idx, _func, _args) \ + __min_heap_sift_up_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, \ + _func, _args) /* Floyd's approach to heapification that is O(nr). */ static __always_inline -void __min_heapify_all(min_heap_char *heap, size_t elem_size, - const struct min_heap_callbacks *func, void *args) +void __min_heapify_all_inline(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args) { int i; for (i = heap->nr / 2 - 1; i >= 0; i--) - __min_heap_sift_down(heap, i, elem_size, func, args); + __min_heap_sift_down_inline(heap, i, elem_size, func, args); } -#define min_heapify_all(_heap, _func, _args) \ - __min_heapify_all((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) +#define min_heapify_all_inline(_heap, _func, _args) \ + __min_heapify_all_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) /* Remove minimum element from the heap, O(log2(nr)). */ static __always_inline -bool __min_heap_pop(min_heap_char *heap, size_t elem_size, - const struct min_heap_callbacks *func, void *args) +bool __min_heap_pop_inline(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args) { void *data = heap->data; @@ -158,13 +160,13 @@ bool __min_heap_pop(min_heap_char *heap, size_t elem_size, /* Place last element at the root (position 0) and then sift down. */ heap->nr--; memcpy(data, data + (heap->nr * elem_size), elem_size); - __min_heap_sift_down(heap, 0, elem_size, func, args); + __min_heap_sift_down_inline(heap, 0, elem_size, func, args); return true; } -#define min_heap_pop(_heap, _func, _args) \ - __min_heap_pop((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) +#define min_heap_pop_inline(_heap, _func, _args) \ + __min_heap_pop_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) /* * Remove the minimum element and then push the given element. The @@ -172,22 +174,21 @@ bool __min_heap_pop(min_heap_char *heap, size_t elem_size, * efficient than a pop followed by a push that does 2. */ static __always_inline -void __min_heap_pop_push(min_heap_char *heap, - const void *element, size_t elem_size, - const struct min_heap_callbacks *func, - void *args) +void __min_heap_pop_push_inline(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args) { memcpy(heap->data, element, elem_size); - __min_heap_sift_down(heap, 0, elem_size, func, args); + __min_heap_sift_down_inline(heap, 0, elem_size, func, args); } -#define min_heap_pop_push(_heap, _element, _func, _args) \ - __min_heap_pop_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args) +#define min_heap_pop_push_inline(_heap, _element, _func, _args) \ + __min_heap_pop_push_inline((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), \ + _func, _args) /* Push an element on to the heap, O(log2(nr)). */ static __always_inline -bool __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, - const struct min_heap_callbacks *func, void *args) +bool __min_heap_push_inline(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args) { void *data = heap->data; int pos; @@ -201,18 +202,19 @@ bool __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, heap->nr++; /* Sift child at pos up. */ - __min_heap_sift_up(heap, elem_size, pos, func, args); + __min_heap_sift_up_inline(heap, elem_size, pos, func, args); return true; } -#define min_heap_push(_heap, _element, _func, _args) \ - __min_heap_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args) +#define min_heap_push_inline(_heap, _element, _func, _args) \ + __min_heap_push_inline((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), \ + _func, _args) /* Remove ith element from the heap, O(log2(nr)). */ static __always_inline -bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx, - const struct min_heap_callbacks *func, void *args) +bool __min_heap_del_inline(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args) { void *data = heap->data; @@ -224,12 +226,53 @@ bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx, if (idx == heap->nr) return true; func->swp(data + (idx * elem_size), data + (heap->nr * elem_size), args); - __min_heap_sift_up(heap, elem_size, idx, func, args); - __min_heap_sift_down(heap, idx, elem_size, func, args); + __min_heap_sift_up_inline(heap, elem_size, idx, func, args); + __min_heap_sift_down_inline(heap, idx, elem_size, func, args); return true; } +#define min_heap_del_inline(_heap, _idx, _func, _args) \ + __min_heap_del_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, \ + _func, _args) + +void __min_heap_init(min_heap_char *heap, void *data, int size); +void *__min_heap_peek(struct min_heap_char *heap); +bool __min_heap_full(min_heap_char *heap); +void __min_heap_sift_down(min_heap_char *heap, int pos, size_t elem_size, + const struct min_heap_callbacks *func, void *args); +void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args); +void __min_heapify_all(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args); +bool __min_heap_pop(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args); +void __min_heap_pop_push(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args); +bool __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args); +bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args); + +#define min_heap_init(_heap, _data, _size) \ + __min_heap_init((min_heap_char *)_heap, _data, _size) +#define min_heap_peek(_heap) \ + (__minheap_cast(_heap) __min_heap_peek((min_heap_char *)_heap)) +#define min_heap_full(_heap) \ + __min_heap_full((min_heap_char *)_heap) +#define min_heap_sift_down(_heap, _pos, _func, _args) \ + __min_heap_sift_down((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), _func, _args) +#define min_heap_sift_up(_heap, _idx, _func, _args) \ + __min_heap_sift_up((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args) +#define min_heapify_all(_heap, _func, _args) \ + __min_heapify_all((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) +#define min_heap_pop(_heap, _func, _args) \ + __min_heap_pop((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) +#define min_heap_pop_push(_heap, _element, _func, _args) \ + __min_heap_pop_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), \ + _func, _args) +#define min_heap_push(_heap, _element, _func, _args) \ + __min_heap_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args) #define min_heap_del(_heap, _idx, _func, _args) \ __min_heap_del((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args) -- cgit v1.2.3 From aa5888afc2347ebb394c2c4b694fa3026775009e Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sun, 20 Oct 2024 12:01:52 +0800 Subject: lib min_heap: optimize min heap by prescaling counters for better performance Improve the efficiency of the min heap by prescaling counters, eliminating the need to repeatedly compute 'index * element_size' when accessing elements. By doing so, we avoid the overhead associated with recalculating the byte offset for each heap operation. However, with prescaling, the calculation for the parent element's location is no longer as simple as '(i - 1) / 2'. To address this, we copy the parent function from 'lib/sort.c', which calculates the parent offset in a branchless manner without using any division instructions. This optimization should result in a more efficient heap implementation by reducing the computational overhead of finding parent and child offsets. Link: https://lkml.kernel.org/r/20241020040200.939973-3-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Ching-Chun (Jim) Huang Cc: Coly Li Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Jonathan Corbet Cc: Kent Overstreet Cc: "Liang, Kan" Cc: Mark Rutland Cc: Matthew Sakai Cc: Matthew Wilcox (Oracle) Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 73 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index 0abb21173979..41b092a14238 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -38,6 +38,32 @@ struct min_heap_callbacks { void (*swp)(void *lhs, void *rhs, void *args); }; +/** + * parent - given the offset of the child, find the offset of the parent. + * @i: the offset of the heap element whose parent is sought. Non-zero. + * @lsbit: a precomputed 1-bit mask, equal to "size & -size" + * @size: size of each element + * + * In terms of array indexes, the parent of element j = @i/@size is simply + * (j-1)/2. But when working in byte offsets, we can't use implicit + * truncation of integer divides. + * + * Fortunately, we only need one bit of the quotient, not the full divide. + * @size has a least significant bit. That bit will be clear if @i is + * an even multiple of @size, and set if it's an odd multiple. + * + * Logically, we're doing "if (i & lsbit) i -= size;", but since the + * branch is unpredictable, it's done with a bit of clever branch-free + * code instead. + */ +__attribute_const__ __always_inline +static size_t parent(size_t i, unsigned int lsbit, size_t size) +{ + i -= size; + i -= size & -(i & lsbit); + return i / 2; +} + /* Initialize a min-heap. */ static __always_inline void __min_heap_init_inline(min_heap_char *heap, void *data, int size) @@ -78,33 +104,30 @@ static __always_inline void __min_heap_sift_down_inline(min_heap_char *heap, int pos, size_t elem_size, const struct min_heap_callbacks *func, void *args) { - void *left, *right; + const unsigned long lsbit = elem_size & -elem_size; void *data = heap->data; - void *root = data + pos * elem_size; - int i = pos, j; + /* pre-scale counters for performance */ + size_t a = pos * elem_size; + size_t b, c, d; + size_t n = heap->nr * elem_size; /* Find the sift-down path all the way to the leaves. */ - for (;;) { - if (i * 2 + 2 >= heap->nr) - break; - left = data + (i * 2 + 1) * elem_size; - right = data + (i * 2 + 2) * elem_size; - i = func->less(left, right, args) ? i * 2 + 1 : i * 2 + 2; - } + for (b = a; c = 2 * b + elem_size, (d = c + elem_size) < n;) + b = func->less(data + c, data + d, args) ? c : d; /* Special case for the last leaf with no sibling. */ - if (i * 2 + 2 == heap->nr) - i = i * 2 + 1; + if (d == n) + b = c; /* Backtrack to the correct location. */ - while (i != pos && func->less(root, data + i * elem_size, args)) - i = (i - 1) / 2; + while (b != a && func->less(data + a, data + b, args)) + b = parent(b, lsbit, elem_size); /* Shift the element into its correct place. */ - j = i; - while (i != pos) { - i = (i - 1) / 2; - func->swp(data + i * elem_size, data + j * elem_size, args); + c = b; + while (b != a) { + b = parent(b, lsbit, elem_size); + func->swp(data + b, data + c, args); } } @@ -117,15 +140,17 @@ static __always_inline void __min_heap_sift_up_inline(min_heap_char *heap, size_t elem_size, size_t idx, const struct min_heap_callbacks *func, void *args) { + const unsigned long lsbit = elem_size & -elem_size; void *data = heap->data; - size_t parent; + /* pre-scale counters for performance */ + size_t a = idx * elem_size, b; - while (idx) { - parent = (idx - 1) / 2; - if (func->less(data + parent * elem_size, data + idx * elem_size, args)) + while (a) { + b = parent(a, lsbit, elem_size); + if (func->less(data + b, data + a, args)) break; - func->swp(data + parent * elem_size, data + idx * elem_size, args); - idx = parent; + func->swp(data + a, data + b, args); + a = b; } } -- cgit v1.2.3 From 03ec56d084611b5a4dc06ffa74db0928616e4d7f Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sun, 20 Oct 2024 12:01:53 +0800 Subject: lib min_heap: avoid indirect function call by providing default swap The non-inline min heap API can result in an indirect function call to the custom swap function. This becomes particularly costly when CONFIG_MITIGATION_RETPOLINE is enabled, as indirect function calls are expensive in this case. To address this, copy the code from lib/sort.c and provide a default builtin swap implementation that performs element swaps based on the element size. This change allows most users to avoid the overhead of indirect function calls, improving efficiency. Link: https://lkml.kernel.org/r/20241020040200.939973-4-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Ching-Chun (Jim) Huang Cc: Coly Li Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Jonathan Corbet Cc: Kent Overstreet Cc: "Liang, Kan" Cc: Mark Rutland Cc: Matthew Sakai Cc: Matthew Wilcox (Oracle) Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 159 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 156 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index 41b092a14238..e781727c8916 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -38,6 +38,147 @@ struct min_heap_callbacks { void (*swp)(void *lhs, void *rhs, void *args); }; +/** + * is_aligned - is this pointer & size okay for word-wide copying? + * @base: pointer to data + * @size: size of each element + * @align: required alignment (typically 4 or 8) + * + * Returns true if elements can be copied using word loads and stores. + * The size must be a multiple of the alignment, and the base address must + * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. + * + * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)" + * to "if ((a | b) & mask)", so we do that by hand. + */ +__attribute_const__ __always_inline +static bool is_aligned(const void *base, size_t size, unsigned char align) +{ + unsigned char lsbits = (unsigned char)size; + + (void)base; +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + lsbits |= (unsigned char)(uintptr_t)base; +#endif + return (lsbits & (align - 1)) == 0; +} + +/** + * swap_words_32 - swap two elements in 32-bit chunks + * @a: pointer to the first element to swap + * @b: pointer to the second element to swap + * @n: element size (must be a multiple of 4) + * + * Exchange the two objects in memory. This exploits base+index addressing, + * which basically all CPUs have, to minimize loop overhead computations. + * + * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the + * bottom of the loop, even though the zero flag is still valid from the + * subtract (since the intervening mov instructions don't alter the flags). + * Gcc 8.1.0 doesn't have that problem. + */ +static __always_inline +void swap_words_32(void *a, void *b, size_t n) +{ + do { + u32 t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; + } while (n); +} + +/** + * swap_words_64 - swap two elements in 64-bit chunks + * @a: pointer to the first element to swap + * @b: pointer to the second element to swap + * @n: element size (must be a multiple of 8) + * + * Exchange the two objects in memory. This exploits base+index + * addressing, which basically all CPUs have, to minimize loop overhead + * computations. + * + * We'd like to use 64-bit loads if possible. If they're not, emulating + * one requires base+index+4 addressing which x86 has but most other + * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads, + * but it's possible to have 64-bit loads without 64-bit pointers (e.g. + * x32 ABI). Are there any cases the kernel needs to worry about? + */ +static __always_inline +void swap_words_64(void *a, void *b, size_t n) +{ + do { +#ifdef CONFIG_64BIT + u64 t = *(u64 *)(a + (n -= 8)); + *(u64 *)(a + n) = *(u64 *)(b + n); + *(u64 *)(b + n) = t; +#else + /* Use two 32-bit transfers to avoid base+index+4 addressing */ + u32 t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; + + t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; +#endif + } while (n); +} + +/** + * swap_bytes - swap two elements a byte at a time + * @a: pointer to the first element to swap + * @b: pointer to the second element to swap + * @n: element size + * + * This is the fallback if alignment doesn't allow using larger chunks. + */ +static __always_inline +void swap_bytes(void *a, void *b, size_t n) +{ + do { + char t = ((char *)a)[--n]; + ((char *)a)[n] = ((char *)b)[n]; + ((char *)b)[n] = t; + } while (n); +} + +/* + * The values are arbitrary as long as they can't be confused with + * a pointer, but small integers make for the smallest compare + * instructions. + */ +#define SWAP_WORDS_64 ((void (*)(void *, void *, void *))0) +#define SWAP_WORDS_32 ((void (*)(void *, void *, void *))1) +#define SWAP_BYTES ((void (*)(void *, void *, void *))2) + +/* + * Selects the appropriate swap function based on the element size. + */ +static __always_inline +void *select_swap_func(const void *base, size_t size) +{ + if (is_aligned(base, size, 8)) + return SWAP_WORDS_64; + else if (is_aligned(base, size, 4)) + return SWAP_WORDS_32; + else + return SWAP_BYTES; +} + +static __always_inline +void do_swap(void *a, void *b, size_t size, void (*swap_func)(void *lhs, void *rhs, void *args), + void *priv) +{ + if (swap_func == SWAP_WORDS_64) + swap_words_64(a, b, size); + else if (swap_func == SWAP_WORDS_32) + swap_words_32(a, b, size); + else if (swap_func == SWAP_BYTES) + swap_bytes(a, b, size); + else + swap_func(a, b, priv); +} + /** * parent - given the offset of the child, find the offset of the parent. * @i: the offset of the heap element whose parent is sought. Non-zero. @@ -106,11 +247,15 @@ void __min_heap_sift_down_inline(min_heap_char *heap, int pos, size_t elem_size, { const unsigned long lsbit = elem_size & -elem_size; void *data = heap->data; + void (*swp)(void *lhs, void *rhs, void *args) = func->swp; /* pre-scale counters for performance */ size_t a = pos * elem_size; size_t b, c, d; size_t n = heap->nr * elem_size; + if (!swp) + swp = select_swap_func(data, elem_size); + /* Find the sift-down path all the way to the leaves. */ for (b = a; c = 2 * b + elem_size, (d = c + elem_size) < n;) b = func->less(data + c, data + d, args) ? c : d; @@ -127,7 +272,7 @@ void __min_heap_sift_down_inline(min_heap_char *heap, int pos, size_t elem_size, c = b; while (b != a) { b = parent(b, lsbit, elem_size); - func->swp(data + b, data + c, args); + do_swap(data + b, data + c, elem_size, swp, args); } } @@ -142,14 +287,18 @@ void __min_heap_sift_up_inline(min_heap_char *heap, size_t elem_size, size_t idx { const unsigned long lsbit = elem_size & -elem_size; void *data = heap->data; + void (*swp)(void *lhs, void *rhs, void *args) = func->swp; /* pre-scale counters for performance */ size_t a = idx * elem_size, b; + if (!swp) + swp = select_swap_func(data, elem_size); + while (a) { b = parent(a, lsbit, elem_size); if (func->less(data + b, data + a, args)) break; - func->swp(data + a, data + b, args); + do_swap(data + a, data + b, elem_size, swp, args); a = b; } } @@ -242,15 +391,19 @@ bool __min_heap_del_inline(min_heap_char *heap, size_t elem_size, size_t idx, const struct min_heap_callbacks *func, void *args) { void *data = heap->data; + void (*swp)(void *lhs, void *rhs, void *args) = func->swp; if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap")) return false; + if (!swp) + swp = select_swap_func(data, elem_size); + /* Place last element at the root (position 0) and then sift down. */ heap->nr--; if (idx == heap->nr) return true; - func->swp(data + (idx * elem_size), data + (heap->nr * elem_size), args); + do_swap(data + (idx * elem_size), data + (heap->nr * elem_size), elem_size, swp, args); __min_heap_sift_up_inline(heap, elem_size, idx, func, args); __min_heap_sift_down_inline(heap, idx, elem_size, func, args); -- cgit v1.2.3 From e1ef630c56d36770e180f0d0bf7b61b5289f5c48 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Fri, 1 Nov 2024 11:57:13 +0200 Subject: clk: Add devm_clk_hw_register_gate_parent_hw() Add devm_clk_hw_register_gate_parent_hw() macro to allow registering devres managed gate clocks providing struct clk_hw object as parent. Reviewed-by: Geert Uytterhoeven Acked-by: Stephen Boyd Signed-off-by: Claudiu Beznea Link: https://lore.kernel.org/20241101095720.2247815-3-claudiu.beznea.uj@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- include/linux/clk-provider.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 7e43caabb54b..28cf7d103e92 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -622,6 +622,24 @@ struct clk *clk_register_gate(struct device *dev, const char *name, __devm_clk_hw_register_gate((dev), NULL, (name), (parent_name), NULL, \ NULL, (flags), (reg), (bit_idx), \ (clk_gate_flags), (lock)) +/** + * devm_clk_hw_register_gate_parent_hw - register a gate clock with the clock + * framework + * @dev: device that is registering this clock + * @name: name of this clock + * @parent_hw: pointer to parent clk + * @flags: framework-specific flags for this clock + * @reg: register address to control gating of this clock + * @bit_idx: which bit in the register controls gating of this clock + * @clk_gate_flags: gate-specific flags for this clock + * @lock: shared register lock for this clock + */ +#define devm_clk_hw_register_gate_parent_hw(dev, name, parent_hw, flags, \ + reg, bit_idx, clk_gate_flags, \ + lock) \ + __devm_clk_hw_register_gate((dev), NULL, (name), NULL, (parent_hw), \ + NULL, (flags), (reg), (bit_idx), \ + (clk_gate_flags), (lock)) /** * devm_clk_hw_register_gate_parent_data - register a gate clock with the * clock framework -- cgit v1.2.3 From 32360bf6a5d4016669c3545e7b0ec939937f5331 Mon Sep 17 00:00:00 2001 From: Dmitry Rokosov Date: Wed, 4 Sep 2024 01:39:30 +0300 Subject: leds: Introduce ordered workqueue for LEDs events instead of system_wq This allows to setup ordered workqueue for LEDs events. This may be useful, because default 'system_wq' does not guarantee execution order of each work_struct, thus for several brightness update requests (for multiple LEDs), real brightness switch could be in random order. Yes, for sysfs-based LEDs we have flush_work() call inside brightness_store() operation, but it's blocking call, so userspace caller can be blocked at a long time, which means LEDs animation stream can be broken. Ordered workqueue has the same behaviour as system_wq + flush_work(), but all scheduled works are async and userspace caller is not blocked, which it better for userspace animation scheduling. Signed-off-by: Alexey Romanov Signed-off-by: Dmitry Rokosov Link: https://lore.kernel.org/r/20240903223936.21292-1-ddrokosov@salutedevices.com [Lee: Couple of style fix-ups] Signed-off-by: Lee Jones --- include/linux/leds.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/leds.h b/include/linux/leds.h index e5968c3ed4ae..9f34b9facaad 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -171,6 +171,7 @@ struct led_classdev { int new_blink_brightness; void (*flash_resume)(struct led_classdev *led_cdev); + struct workqueue_struct *wq; /* LED workqueue */ struct work_struct set_brightness_work; int delayed_set_value; unsigned long delayed_delay_on; -- cgit v1.2.3 From 4ca7cd938725a4050dcd62ae9472e931d603118d Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Sun, 3 Nov 2024 21:35:27 +0530 Subject: leds: class: Protect brightness_show() with led_cdev->led_access mutex There is NULL pointer issue observed if from Process A where hid device being added which results in adding a led_cdev addition and later a another call to access of led_cdev attribute from Process B can result in NULL pointer issue. Use mutex led_cdev->led_access to protect access to led->cdev and its attribute inside brightness_show() and max_brightness_show() and also update the comment for mutex that it should be used to protect the led class device fields. Process A Process B kthread+0x114 worker_thread+0x244 process_scheduled_works+0x248 uhid_device_add_worker+0x24 hid_add_device+0x120 device_add+0x268 bus_probe_device+0x94 device_initial_probe+0x14 __device_attach+0xfc bus_for_each_drv+0x10c __device_attach_driver+0x14c driver_probe_device+0x3c __driver_probe_device+0xa0 really_probe+0x190 hid_device_probe+0x130 ps_probe+0x990 ps_led_register+0x94 devm_led_classdev_register_ext+0x58 led_classdev_register_ext+0x1f8 device_create_with_groups+0x48 device_create_groups_vargs+0xc8 device_add+0x244 kobject_uevent+0x14 kobject_uevent_env[jt]+0x224 mutex_unlock[jt]+0xc4 __mutex_unlock_slowpath+0xd4 wake_up_q+0x70 try_to_wake_up[jt]+0x48c preempt_schedule_common+0x28 __schedule+0x628 __switch_to+0x174 el0t_64_sync+0x1a8/0x1ac el0t_64_sync_handler+0x68/0xbc el0_svc+0x38/0x68 do_el0_svc+0x1c/0x28 el0_svc_common+0x80/0xe0 invoke_syscall+0x58/0x114 __arm64_sys_read+0x1c/0x2c ksys_read+0x78/0xe8 vfs_read+0x1e0/0x2c8 kernfs_fop_read_iter+0x68/0x1b4 seq_read_iter+0x158/0x4ec kernfs_seq_show+0x44/0x54 sysfs_kf_seq_show+0xb4/0x130 dev_attr_show+0x38/0x74 brightness_show+0x20/0x4c dualshock4_led_get_brightness+0xc/0x74 [ 3313.874295][ T4013] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000060 [ 3313.874301][ T4013] Mem abort info: [ 3313.874303][ T4013] ESR = 0x0000000096000006 [ 3313.874305][ T4013] EC = 0x25: DABT (current EL), IL = 32 bits [ 3313.874307][ T4013] SET = 0, FnV = 0 [ 3313.874309][ T4013] EA = 0, S1PTW = 0 [ 3313.874311][ T4013] FSC = 0x06: level 2 translation fault [ 3313.874313][ T4013] Data abort info: [ 3313.874314][ T4013] ISV = 0, ISS = 0x00000006, ISS2 = 0x00000000 [ 3313.874316][ T4013] CM = 0, WnR = 0, TnD = 0, TagAccess = 0 [ 3313.874318][ T4013] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 3313.874320][ T4013] user pgtable: 4k pages, 39-bit VAs, pgdp=00000008f2b0a000 .. [ 3313.874332][ T4013] Dumping ftrace buffer: [ 3313.874334][ T4013] (ftrace buffer empty) .. .. [ dd3313.874639][ T4013] CPU: 6 PID: 4013 Comm: InputReader [ 3313.874648][ T4013] pc : dualshock4_led_get_brightness+0xc/0x74 [ 3313.874653][ T4013] lr : led_update_brightness+0x38/0x60 [ 3313.874656][ T4013] sp : ffffffc0b910bbd0 .. .. [ 3313.874685][ T4013] Call trace: [ 3313.874687][ T4013] dualshock4_led_get_brightness+0xc/0x74 [ 3313.874690][ T4013] brightness_show+0x20/0x4c [ 3313.874692][ T4013] dev_attr_show+0x38/0x74 [ 3313.874696][ T4013] sysfs_kf_seq_show+0xb4/0x130 [ 3313.874700][ T4013] kernfs_seq_show+0x44/0x54 [ 3313.874703][ T4013] seq_read_iter+0x158/0x4ec [ 3313.874705][ T4013] kernfs_fop_read_iter+0x68/0x1b4 [ 3313.874708][ T4013] vfs_read+0x1e0/0x2c8 [ 3313.874711][ T4013] ksys_read+0x78/0xe8 [ 3313.874714][ T4013] __arm64_sys_read+0x1c/0x2c [ 3313.874718][ T4013] invoke_syscall+0x58/0x114 [ 3313.874721][ T4013] el0_svc_common+0x80/0xe0 [ 3313.874724][ T4013] do_el0_svc+0x1c/0x28 [ 3313.874727][ T4013] el0_svc+0x38/0x68 [ 3313.874730][ T4013] el0t_64_sync_handler+0x68/0xbc [ 3313.874732][ T4013] el0t_64_sync+0x1a8/0x1ac Signed-off-by: Mukesh Ojha Reviewed-by: Anish Kumar Link: https://lore.kernel.org/r/20241103160527.82487-1-quic_mojha@quicinc.com Signed-off-by: Lee Jones --- include/linux/leds.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/leds.h b/include/linux/leds.h index 9f34b9facaad..98f9719c924c 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -239,7 +239,7 @@ struct led_classdev { struct kernfs_node *brightness_hw_changed_kn; #endif - /* Ensures consistent access to the LED Flash Class device */ + /* Ensures consistent access to the LED class device */ struct mutex led_access; }; -- cgit v1.2.3 From 33b091c08ed85e023c21376e6f787355fd46b440 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Fri, 1 Nov 2024 13:42:49 -0300 Subject: libfs: Fix kernel-doc warning in generic_ci_validate_strict_name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the indentation of the return values from generic_ci_validate_strict_name() to properly render the comment and to address a `make htmldocs` warning: Documentation/filesystems/api-summary:14: include/linux/fs.h:3504: WARNING: Bullet list ends without a blank line; unexpected unindent. Fixes: 0e152beb5aa1 ("libfs: Create the helper function generic_ci_validate_strict_name()") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/lkml/20241030162435.05425f60@canb.auug.org.au/ Signed-off-by: André Almeida Link: https://lore.kernel.org/r/20241101164251.327884-2-andrealmeid@igalia.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index b277369672a1..001d580af862 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3477,12 +3477,12 @@ int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, * @name: name of the new file * * Return: - * * True if the filename is suitable for this directory. It can be - * true if a given name is not suitable for a strict encoding - * directory, but the directory being used isn't strict + * * True: if the filename is suitable for this directory. It can be + * true if a given name is not suitable for a strict encoding + * directory, but the directory being used isn't strict * * False if the filename isn't suitable for this directory. This only - * happens when a directory is casefolded and the filesystem is strict - * about its encoding. + * happens when a directory is casefolded and the filesystem is strict + * about its encoding. */ static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name) { -- cgit v1.2.3 From 6140be90ec70c39fa844741ca3cc807dd0866394 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20G=C3=B6ttsche?= Date: Fri, 26 Apr 2024 18:20:14 +0200 Subject: fs/xattr: add *at family syscalls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the four syscalls setxattrat(), getxattrat(), listxattrat() and removexattrat(). Those can be used to operate on extended attributes, especially security related ones, either relative to a pinned directory or on a file descriptor without read access, avoiding a /proc//fd/ detour, requiring a mounted procfs. One use case will be setfiles(8) setting SELinux file contexts ("security.selinux") without race conditions and without a file descriptor opened with read access requiring SELinux read permission. Use the do_{name}at() pattern from fs/open.c. Pass the value of the extended attribute, its length, and for setxattrat(2) the command (XATTR_CREATE or XATTR_REPLACE) via an added struct xattr_args to not exceed six syscall arguments and not merging the AT_* and XATTR_* flags. [AV: fixes by Christian Brauner folded in, the entire thing rebased on top of {filename,file}_...xattr() primitives, treatment of empty pathnames regularized. As the result, AT_EMPTY_PATH+NULL handling is cheap, so f...(2) can use it] Signed-off-by: Christian Göttsche Link: https://lore.kernel.org/r/20240426162042.191916-1-cgoettsche@seltendoof.de Reviewed-by: Arnd Bergmann Reviewed-by: Christian Brauner CC: x86@kernel.org CC: linux-alpha@vger.kernel.org CC: linux-kernel@vger.kernel.org CC: linux-arm-kernel@lists.infradead.org CC: linux-ia64@vger.kernel.org CC: linux-m68k@lists.linux-m68k.org CC: linux-mips@vger.kernel.org CC: linux-parisc@vger.kernel.org CC: linuxppc-dev@lists.ozlabs.org CC: linux-s390@vger.kernel.org CC: linux-sh@vger.kernel.org CC: sparclinux@vger.kernel.org CC: linux-fsdevel@vger.kernel.org CC: audit@vger.kernel.org CC: linux-arch@vger.kernel.org CC: linux-api@vger.kernel.org CC: linux-security-module@vger.kernel.org CC: selinux@vger.kernel.org [brauner: slight tweaks] Signed-off-by: Christian Brauner Signed-off-by: Al Viro --- include/linux/syscalls.h | 13 +++++++++++++ include/linux/xattr.h | 4 ++++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 5758104921e6..c6333204d451 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -77,6 +77,7 @@ struct cachestat_range; struct cachestat; struct statmount; struct mnt_id_req; +struct xattr_args; #include #include @@ -338,23 +339,35 @@ asmlinkage long sys_io_uring_register(unsigned int fd, unsigned int op, void __user *arg, unsigned int nr_args); asmlinkage long sys_setxattr(const char __user *path, const char __user *name, const void __user *value, size_t size, int flags); +asmlinkage long sys_setxattrat(int dfd, const char __user *path, unsigned int at_flags, + const char __user *name, + const struct xattr_args __user *args, size_t size); asmlinkage long sys_lsetxattr(const char __user *path, const char __user *name, const void __user *value, size_t size, int flags); asmlinkage long sys_fsetxattr(int fd, const char __user *name, const void __user *value, size_t size, int flags); asmlinkage long sys_getxattr(const char __user *path, const char __user *name, void __user *value, size_t size); +asmlinkage long sys_getxattrat(int dfd, const char __user *path, unsigned int at_flags, + const char __user *name, + struct xattr_args __user *args, size_t size); asmlinkage long sys_lgetxattr(const char __user *path, const char __user *name, void __user *value, size_t size); asmlinkage long sys_fgetxattr(int fd, const char __user *name, void __user *value, size_t size); asmlinkage long sys_listxattr(const char __user *path, char __user *list, size_t size); +asmlinkage long sys_listxattrat(int dfd, const char __user *path, + unsigned int at_flags, + char __user *list, size_t size); asmlinkage long sys_llistxattr(const char __user *path, char __user *list, size_t size); asmlinkage long sys_flistxattr(int fd, char __user *list, size_t size); asmlinkage long sys_removexattr(const char __user *path, const char __user *name); +asmlinkage long sys_removexattrat(int dfd, const char __user *path, + unsigned int at_flags, + const char __user *name); asmlinkage long sys_lremovexattr(const char __user *path, const char __user *name); asmlinkage long sys_fremovexattr(int fd, const char __user *name); diff --git a/include/linux/xattr.h b/include/linux/xattr.h index d20051865800..86b0d47984a1 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -19,6 +19,10 @@ #include #include +/* List of all open_how "versions". */ +#define XATTR_ARGS_SIZE_VER0 16 /* sizeof first published struct */ +#define XATTR_ARGS_SIZE_LATEST XATTR_ARGS_SIZE_VER0 + struct inode; struct dentry; -- cgit v1.2.3 From 6f94cbc29adacc15007c5a16295052e674099282 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 3 Nov 2024 08:46:07 -0700 Subject: io_uring/rsrc: split io_kiocb node type assignments Currently the io_rsrc_node assignment in io_kiocb is an array of two pointers, as two nodes may be assigned to a request - one file node, and one buffer node. However, the buffer node can co-exist with the provided buffers, as currently it's not supported to use both provided and registered buffers at the same time. This crucially brings struct io_kiocb down to 4 cache lines again, as before it spilled into the 5th cacheline. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index d52fec533c51..01e7fb9fcfe2 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -475,6 +475,7 @@ enum { REQ_F_BL_EMPTY_BIT, REQ_F_BL_NO_RECYCLE_BIT, REQ_F_BUFFERS_COMMIT_BIT, + REQ_F_BUF_NODE_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -553,6 +554,8 @@ enum { REQ_F_BL_NO_RECYCLE = IO_REQ_FLAG(REQ_F_BL_NO_RECYCLE_BIT), /* buffer ring head needs incrementing on put */ REQ_F_BUFFERS_COMMIT = IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT), + /* buf node is valid */ + REQ_F_BUF_NODE = IO_REQ_FLAG(REQ_F_BUF_NODE_BIT), }; typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); @@ -633,6 +636,8 @@ struct io_kiocb { * REQ_F_BUFFER_RING is set. */ struct io_buffer_list *buf_list; + + struct io_rsrc_node *buf_node; }; union { @@ -642,7 +647,7 @@ struct io_kiocb { __poll_t apoll_events; }; - struct io_rsrc_node *rsrc_nodes[2]; + struct io_rsrc_node *file_node; atomic_t refs; bool cancel_seq_set; -- cgit v1.2.3 From b6f58a3f4aa8dba424356c7a69388a81f4459300 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 3 Nov 2024 10:23:38 -0700 Subject: io_uring: move struct io_kiocb from task_struct to io_uring_task Rather than store the task_struct itself in struct io_kiocb, store the io_uring specific task_struct. The life times are the same in terms of io_uring, and this avoids doing some dereferences through the task_struct. For the hot path of putting local task references, we can deref req->tctx instead, which we'll need anyway in that function regardless of whether it's local or remote references. This is mostly straight forward, except the original task PF_EXITING check needs a bit of tweaking. task_work is _always_ run from the originating task, except in the fallback case, where it's run from a kernel thread. Replace the potentially racy (in case of fallback work) checks for req->task->flags with current->flags. It's either the still the original task, in which case PF_EXITING will be sane, or it has PF_KTHREAD set, in which case it's fallback work. Both cases should prevent moving forward with the given request. Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 2 +- include/linux/io_uring_types.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index c189d36ad55e..578a3fdf5c71 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -110,7 +110,7 @@ static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd) { - return cmd_to_io_kiocb(cmd)->task; + return cmd_to_io_kiocb(cmd)->tctx->task; } #endif /* _LINUX_IO_URING_CMD_H */ diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 01e7fb9fcfe2..fba2988accc3 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -84,6 +84,7 @@ struct io_uring_task { /* submission side */ int cached_refs; const struct io_ring_ctx *last; + struct task_struct *task; struct io_wq *io_wq; struct file *registered_rings[IO_RINGFD_REG_MAX]; @@ -625,7 +626,7 @@ struct io_kiocb { struct io_cqe cqe; struct io_ring_ctx *ctx; - struct task_struct *task; + struct io_uring_task *tctx; union { /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ -- cgit v1.2.3 From 6bf90bd8c58a305994948eb3409d91a7d8f2edae Mon Sep 17 00:00:00 2001 From: Olivier Langlois Date: Sun, 13 Oct 2024 14:29:24 -0400 Subject: io_uring/napi: add static napi tracking strategy Add the static napi tracking strategy. That allows the user to manually manage the napi ids list for busy polling, and eliminate the overhead of dynamically updating the list from the fast path. Signed-off-by: Olivier Langlois Link: https://lore.kernel.org/r/96943de14968c35a5c599352259ad98f3c0770ba.1728828877.git.olivier@trillion01.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index fba2988accc3..072e65e93105 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -408,7 +408,7 @@ struct io_ring_ctx { /* napi busy poll default timeout */ ktime_t napi_busy_poll_dt; bool napi_prefer_busy_poll; - bool napi_enabled; + u8 napi_track_mode; DECLARE_HASHTABLE(napi_ht, 4); #endif -- cgit v1.2.3 From 3b96b895127b7c0aed63d82c974b46340e8466c1 Mon Sep 17 00:00:00 2001 From: Esther Shimanovich Date: Tue, 10 Sep 2024 17:57:45 +0000 Subject: PCI: Detect and trust built-in Thunderbolt chips Some computers with CPUs that lack Thunderbolt features use discrete Thunderbolt chips to add Thunderbolt functionality. These Thunderbolt chips are located within the chassis; between the Root Port labeled ExternalFacingPort and the USB-C port. These Thunderbolt PCIe devices should be labeled as fixed and trusted, as they are built into the computer. Otherwise, security policies that rely on those flags may have unintended results, such as preventing USB-C ports from enumerating. Detect the above scenario through the process of elimination. 1) Integrated Thunderbolt host controllers already have Thunderbolt implemented, so anything outside their external facing Root Port is removable and untrusted. Detect them using the following properties: - Most integrated host controllers have the "usb4-host-interface" ACPI property, as described here: https://learn.microsoft.com/en-us/windows-hardware/drivers/pci/dsd-for-pcie-root-ports#mapping-native-protocols-pcie-displayport-tunneled-through-usb4-to-usb4-host-routers - Integrated Thunderbolt PCIe Root Ports before Alder Lake do not have the "usb4-host-interface" ACPI property. Identify those by their PCI IDs instead. 2) If a Root Port does not have integrated Thunderbolt capabilities, but has the "ExternalFacingPort" ACPI property, that means the manufacturer has opted to use a discrete Thunderbolt host controller that is built into the computer. This host controller can be identified by virtue of being located directly below an external-facing Root Port that lacks integrated Thunderbolt. Label it as trusted and fixed. Everything downstream from it is untrusted and removable. The "ExternalFacingPort" ACPI property is described here: https://learn.microsoft.com/en-us/windows-hardware/drivers/pci/dsd-for-pcie-root-ports#identifying-externally-exposed-pcie-root-ports Link: https://lore.kernel.org/r/20240910-trust-tbt-fix-v5-1-7a7a42a5f496@chromium.org Suggested-by: Mika Westerberg Signed-off-by: Esther Shimanovich Signed-off-by: Bjorn Helgaas Tested-by: Mika Westerberg Tested-by: Mario Limonciello Reviewed-by: Mika Westerberg Reviewed-by: Mario Limonciello --- include/linux/pci.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 573b4c4c2be6..4e77c4230c0a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2609,6 +2609,12 @@ pci_host_bridge_acpi_msi_domain(struct pci_bus *bus) { return NULL; } static inline bool pci_pr3_present(struct pci_dev *pdev) { return false; } #endif +#if defined(CONFIG_X86) && defined(CONFIG_ACPI) +bool arch_pci_dev_is_removable(struct pci_dev *pdev); +#else +static inline bool arch_pci_dev_is_removable(struct pci_dev *pdev) { return false; } +#endif + #ifdef CONFIG_EEH static inline struct eeh_dev *pci_dev_to_eeh_dev(struct pci_dev *pdev) { -- cgit v1.2.3 From 25caea955cc950507d179f3ef456404b475e8c23 Mon Sep 17 00:00:00 2001 From: Inochi Amaoto Date: Thu, 31 Oct 2024 14:08:58 +0800 Subject: irqchip: Add T-HEAD C900 ACLINT SSWI driver Add a driver for the T-HEAD C900 ACLINT SSWI device. This device allows the system with T-HEAD cpus to send ipi via fast device interface. Signed-off-by: Inochi Amaoto Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241031060859.722258-3-inochiama@gmail.com --- include/linux/cpuhotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 2361ed4d2b15..799052249c7b 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -147,6 +147,7 @@ enum cpuhp_state { CPUHP_AP_IRQ_EIOINTC_STARTING, CPUHP_AP_IRQ_AVECINTC_STARTING, CPUHP_AP_IRQ_SIFIVE_PLIC_STARTING, + CPUHP_AP_IRQ_THEAD_ACLINT_SSWI_STARTING, CPUHP_AP_IRQ_RISCV_IMSIC_STARTING, CPUHP_AP_IRQ_RISCV_SBI_IPI_STARTING, CPUHP_AP_ARM_MVEBU_COHERENCY, -- cgit v1.2.3 From 513793bc6ab331b947111e8efaf8fcef33fb83e5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:31 +0100 Subject: posix-timers: Make signal delivery consistent Signals of timers which are reprogammed, disarmed or deleted can deliver signals related to the past. The POSIX spec is blury about this: - "The effect of disarming or resetting a timer with pending expiration notifications is unspecified." - "The disposition of pending signals for the deleted timer is unspecified." In both cases it is reasonable to expect that pending signals are discarded. Especially in the reprogramming case it does not make sense to account for previous overruns or to deliver a signal for a timer which has been disarmed. This makes the behaviour consistent and understandable. Remove the si_sys_private check from the signal delivery code and invoke posix_timer_deliver_signal() unconditionally for posix timer related signals. Change posix_timer_deliver_signal() so it controls the actual signal delivery via the return value. It now instructs the signal code to drop the signal when: 1) The timer does not longer exist in the hash table 2) The timer signal_seq value is not the same as the si_sys_private value which was set when the signal was queued. This is also a preparatory change to embed the sigqueue into the k_itimer structure, which in turn allows to remove the si_sys_private magic. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241105064213.040348644@linutronix.de --- include/linux/posix-timers.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 02afbb4da7f7..8c6d97412526 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -137,8 +137,6 @@ static inline void clear_posix_cputimers_work(struct task_struct *p) { } static inline void posix_cputimers_init_work(void) { } #endif -#define REQUEUE_PENDING 1 - /** * struct k_itimer - POSIX.1b interval timer structure. * @list: List head for binding the timer to signals->posix_timers -- cgit v1.2.3 From bf635681c906ad056d1fda325de8d1c12c9f8201 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:33 +0100 Subject: posix-cpu-timers: Cleanup the firing logic The firing flag of a posix CPU timer is tristate: 0: when the timer is not about to deliver a signal 1: when the timer has expired, but the signal has not been delivered yet -1: when the timer was queued for signal delivery and a rearm operation raced against it and supressed the signal delivery. This is a pointless exercise as this can be simply expressed with a boolean. Only if set, the signal is delivered. This makes delete and rearm consistent with the rest of the posix timers. Convert firing to bool and fixup the usage sites accordingly and add comments why the timer cannot be dequeued right away. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241105064213.172848618@linutronix.de --- include/linux/posix-timers.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 8c6d97412526..b1de21731a08 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -49,7 +49,7 @@ struct cpu_timer { struct timerqueue_head *head; struct pid *pid; struct list_head elist; - int firing; + bool firing; struct task_struct __rcu *handling; }; -- cgit v1.2.3 From 4cf7bf2a2f1a8ace4a49a1138c8123fdb5990093 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:35 +0100 Subject: posix-cpu-timers: Use dedicated flag for CPU timer nanosleep POSIX CPU timer nanosleep creates a k_itimer on stack and uses the sigq pointer to detect the nanosleep case in the expiry function. Prepare for embedding sigqueue into struct k_itimer by using a dedicated flag for nanosleep. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241105064213.238550394@linutronix.de --- include/linux/posix-timers.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index b1de21731a08..bcd01208d795 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -42,6 +42,7 @@ static inline int clockid_to_fd(const clockid_t clk) * @pid: Pointer to target task PID * @elist: List head for the expiry list * @firing: Timer is currently firing + * @nanosleep: Timer is used for nanosleep and is not a regular posix-timer * @handling: Pointer to the task which handles expiry */ struct cpu_timer { @@ -50,6 +51,7 @@ struct cpu_timer { struct pid *pid; struct list_head elist; bool firing; + bool nanosleep; struct task_struct __rcu *handling; }; -- cgit v1.2.3 From 5d916a0988eed5217c103932ff4887c9ae83c89c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:36 +0100 Subject: posix-timers: Add a refcount to struct k_itimer To cure the SIG_IGN handling for posix interval timers, the preallocated sigqueue needs to be embedded into struct k_itimer to prevent life time races of all sorts. To make that work correctly it needs reference counting so that timer deletion does not free the timer prematuraly when there is a signal queued or delivered concurrently. Add a rcuref to the posix timer part. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241105064213.304756440@linutronix.de --- include/linux/posix-timers.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index bcd01208d795..9740fd0c2933 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -6,11 +6,13 @@ #include #include #include +#include #include #include struct kernel_siginfo; struct task_struct; +struct k_itimer; static inline clockid_t make_process_cpuclock(const unsigned int pid, const clockid_t clock) @@ -105,6 +107,7 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct, void posixtimer_rearm_itimer(struct task_struct *p); bool posixtimer_deliver_signal(struct kernel_siginfo *info); +void posixtimer_free_timer(struct k_itimer *timer); /* Init task static initializer */ #define INIT_CPU_TIMERBASE(b) { \ @@ -129,6 +132,7 @@ static inline void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) { } static inline void posixtimer_rearm_itimer(struct task_struct *p) { } static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info) { return false; } +static inline void posixtimer_free_timer(struct k_itimer *timer) { } #endif #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK @@ -156,6 +160,7 @@ static inline void posix_cputimers_init_work(void) { } * @it_signal: Pointer to the creators signal struct * @it_pid: The pid of the process/task targeted by the signal * @it_process: The task to wakeup on clock_nanosleep (CPU timers) + * @rcuref: Reference count for life time management * @sigq: Pointer to preallocated sigqueue * @it: Union representing the various posix timer type * internals. @@ -180,6 +185,7 @@ struct k_itimer { struct task_struct *it_process; }; struct sigqueue *sigq; + rcuref_t rcuref; union { struct { struct hrtimer timer; @@ -200,4 +206,12 @@ void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new); +#ifdef CONFIG_POSIX_TIMERS +static inline void posixtimer_putref(struct k_itimer *tmr) +{ + if (rcuref_put(&tmr->rcuref)) + posixtimer_free_timer(tmr); +} +#endif /* !CONFIG_POSIX_TIMERS */ + #endif -- cgit v1.2.3 From 54f1dd642fd088ba969206f09e7afffad7d9db2c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:39 +0100 Subject: signal: Provide posixtimer_sigqueue_init() To cure the SIG_IGN handling for posix interval timers, the preallocated sigqueue needs to be embedded into struct k_itimer to prevent life time races of all sorts. Provide a new function to initialize the embedded sigqueue to prepare for that. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241105064213.450427515@linutronix.de --- include/linux/posix-timers.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 9740fd0c2933..200098d27cc0 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -12,6 +12,7 @@ struct kernel_siginfo; struct task_struct; +struct sigqueue; struct k_itimer; static inline clockid_t make_process_cpuclock(const unsigned int pid, @@ -106,6 +107,7 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct, } void posixtimer_rearm_itimer(struct task_struct *p); +bool posixtimer_init_sigqueue(struct sigqueue *q); bool posixtimer_deliver_signal(struct kernel_siginfo *info); void posixtimer_free_timer(struct k_itimer *timer); -- cgit v1.2.3 From ef1c5bcd6daa674392bdf89b8ae889aafd73f956 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:41 +0100 Subject: posix-timers: Store PID type in the timer instead of re-evaluating the signal delivery mode everywhere. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241105064213.519086500@linutronix.de --- include/linux/posix-timers.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 200098d27cc0..947176582de9 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -180,6 +181,7 @@ struct k_itimer { s64 it_overrun_last; unsigned int it_signal_seq; int it_sigev_notify; + enum pid_type it_pid_type; ktime_t it_interval; struct signal_struct *it_signal; union { -- cgit v1.2.3 From 0360ed14d9826678a50fa2b873e522a24cd3c018 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:42 +0100 Subject: signal: Refactor send_sigqueue() To handle posix timers which have their signal ignored via SIG_IGN properly it is required to requeue a ignored signal for delivery when SIG_IGN is lifted so the timer gets rearmed. Split the required code out of send_sigqueue() so it can be reused in context of sigaction(). While at it rename send_sigqueue() to posixtimer_send_sigqueue() so its clear what this is about. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241105064213.586453412@linutronix.de --- include/linux/posix-timers.h | 1 + include/linux/sched/signal.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 947176582de9..52611ea923b2 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -109,6 +109,7 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct, void posixtimer_rearm_itimer(struct task_struct *p); bool posixtimer_init_sigqueue(struct sigqueue *q); +int posixtimer_send_sigqueue(struct k_itimer *tmr); bool posixtimer_deliver_signal(struct kernel_siginfo *info); void posixtimer_free_timer(struct k_itimer *timer); diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index bd9f569231d9..36283c1c55e9 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -340,7 +340,6 @@ extern int send_sig(int, struct task_struct *, int); extern int zap_other_threads(struct task_struct *p); extern struct sigqueue *sigqueue_alloc(void); extern void sigqueue_free(struct sigqueue *); -extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type, int si_private); extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); static inline void clear_notify_signal(void) -- cgit v1.2.3 From 11629b9808e5900d675fd469d19932ea48060de3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:43 +0100 Subject: signal: Replace resched_timer logic In preparation for handling ignored posix timer signals correctly and embedding the sigqueue struct into struct k_itimer, hand down a pointer to the sigqueue struct into posix_timer_deliver_signal() instead of just having a boolean flag. No functional change. Suggested-by: Eric W. Biederman Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: "Eric W. Biederman" Link: https://lore.kernel.org/all/20241105064213.652658158@linutronix.de --- include/linux/posix-timers.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 52611ea923b2..39f1db76833a 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -110,7 +110,7 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct, void posixtimer_rearm_itimer(struct task_struct *p); bool posixtimer_init_sigqueue(struct sigqueue *q); int posixtimer_send_sigqueue(struct k_itimer *tmr); -bool posixtimer_deliver_signal(struct kernel_siginfo *info); +bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq); void posixtimer_free_timer(struct k_itimer *timer); /* Init task static initializer */ @@ -135,7 +135,8 @@ static inline void posix_cputimers_init(struct posix_cputimers *pct) { } static inline void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) { } static inline void posixtimer_rearm_itimer(struct task_struct *p) { } -static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info) { return false; } +static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info, + struct sigqueue *timer_sigq) { return false; } static inline void posixtimer_free_timer(struct k_itimer *timer) { } #endif -- cgit v1.2.3 From 6017a158beb13b412e55a451379798aae5876514 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:45 +0100 Subject: posix-timers: Embed sigqueue in struct k_itimer To cure the SIG_IGN handling for posix interval timers, the preallocated sigqueue needs to be embedded into struct k_itimer to prevent life time races of all sorts. Now that the prerequisites are in place, embed the sigqueue into struct k_itimer and fixup the relevant usage sites. Aside of preparing for proper SIG_IGN handling, this spares an extra allocation. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241105064213.719695194@linutronix.de --- include/linux/posix-timers.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 39f1db76833a..28c0a30e0853 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -39,6 +39,8 @@ static inline int clockid_to_fd(const clockid_t clk) #ifdef CONFIG_POSIX_TIMERS +#include + /** * cpu_timer - Posix CPU timer representation for k_itimer * @node: timerqueue node to queue in the task/sig @@ -166,7 +168,7 @@ static inline void posix_cputimers_init_work(void) { } * @it_pid: The pid of the process/task targeted by the signal * @it_process: The task to wakeup on clock_nanosleep (CPU timers) * @rcuref: Reference count for life time management - * @sigq: Pointer to preallocated sigqueue + * @sigq: Embedded sigqueue * @it: Union representing the various posix timer type * internals. * @rcu: RCU head for freeing the timer. @@ -190,7 +192,7 @@ struct k_itimer { struct pid *it_pid; struct task_struct *it_process; }; - struct sigqueue *sigq; + struct sigqueue sigq; rcuref_t rcuref; union { struct { @@ -218,6 +220,23 @@ static inline void posixtimer_putref(struct k_itimer *tmr) if (rcuref_put(&tmr->rcuref)) posixtimer_free_timer(tmr); } + +static inline void posixtimer_sigqueue_getref(struct sigqueue *q) +{ + struct k_itimer *tmr = container_of(q, struct k_itimer, sigq); + + WARN_ON_ONCE(!rcuref_get(&tmr->rcuref)); +} + +static inline void posixtimer_sigqueue_putref(struct sigqueue *q) +{ + struct k_itimer *tmr = container_of(q, struct k_itimer, sigq); + + posixtimer_putref(tmr); +} +#else /* CONFIG_POSIX_TIMERS */ +static inline void posixtimer_sigqueue_getref(struct sigqueue *q) { } +static inline void posixtimer_sigqueue_putref(struct sigqueue *q) { } #endif /* !CONFIG_POSIX_TIMERS */ #endif -- cgit v1.2.3 From c2a4796a154bb952be1106911841aab2c8c17c4d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:46 +0100 Subject: signal: Cleanup unused posix-timer leftovers Remove the leftovers of sigqueue preallocation as it's not longer used. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241105064213.786506636@linutronix.de --- include/linux/sched/signal.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 36283c1c55e9..02972fd41931 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -338,8 +338,6 @@ extern void force_fatal_sig(int); extern void force_exit_sig(int); extern int send_sig(int, struct task_struct *, int); extern int zap_other_threads(struct task_struct *p); -extern struct sigqueue *sigqueue_alloc(void); -extern void sigqueue_free(struct sigqueue *); extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); static inline void clear_notify_signal(void) -- cgit v1.2.3 From 647da5f709f112319c0d51e06f330d8afecb1940 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:48 +0100 Subject: posix-timers: Move sequence logic into struct k_itimer The posix timer signal handling uses siginfo::si_sys_private for handling the sequence counter check. That indirection is not longer required and the sequence count value at signal queueing time can be stored in struct k_itimer itself. This removes the requirement of treating siginfo::si_sys_private special as it's now always zero as the kernel does not touch it anymore. Suggested-by: Eric W. Biederman Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: "Eric W. Biederman" Link: https://lore.kernel.org/all/20241105064213.852619866@linutronix.de --- include/linux/posix-timers.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 28c0a30e0853..49a89614d900 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -162,6 +162,7 @@ static inline void posix_cputimers_init_work(void) { } * @it_overrun: The overrun counter for pending signals * @it_overrun_last: The overrun at the time of the last delivered signal * @it_signal_seq: Sequence count to control signal delivery + * @it_sigqueue_seq: The sequence count at the point where the signal was queued * @it_sigev_notify: The notify word of sigevent struct for signal delivery * @it_interval: The interval for periodic timers * @it_signal: Pointer to the creators signal struct @@ -184,6 +185,7 @@ struct k_itimer { s64 it_overrun; s64 it_overrun_last; unsigned int it_signal_seq; + unsigned int it_sigqueue_seq; int it_sigev_notify; enum pid_type it_pid_type; ktime_t it_interval; -- cgit v1.2.3 From 69f032c92cf883ea74a4b69ba3d91317aa6f174e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:49 +0100 Subject: signal: Provide ignored_posix_timers list To prepare for handling posix timer signals on sigaction(SIG_IGN) properly, add a list to task::signal. This list will be used to queue posix timers so their signal can be requeued when SIG_IGN is lifted later. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241105064213.920101900@linutronix.de --- include/linux/sched/signal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 02972fd41931..d5d03d919df8 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -138,6 +138,7 @@ struct signal_struct { /* POSIX.1b Interval Timers */ unsigned int next_posix_timer_id; struct hlist_head posix_timers; + struct hlist_head ignored_posix_timers; /* ITIMER_REAL timer for the process */ struct hrtimer real_timer; -- cgit v1.2.3 From 0e20cd33acc7a173b23900550331ee82a23e9f00 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:51 +0100 Subject: posix-timers: Handle ignored list on delete and exit To handle posix timer signals on sigaction(SIG_IGN) properly, the timers will be queued on a separate ignored list. Add the necessary cleanup code for timer_delete() and exit_itimers(). Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241105064213.987530588@linutronix.de --- include/linux/posix-timers.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 49a89614d900..1608b52a44d5 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -152,7 +152,8 @@ static inline void posix_cputimers_init_work(void) { } /** * struct k_itimer - POSIX.1b interval timer structure. - * @list: List head for binding the timer to signals->posix_timers + * @list: List node for binding the timer to tsk::signal::posix_timers + * @ignored_list: List node for tracking ignored timers in tsk::signal::ignored_posix_timers * @t_hash: Entry in the posix timer hash table * @it_lock: Lock protecting the timer * @kclock: Pointer to the k_clock struct handling this timer @@ -176,6 +177,7 @@ static inline void posix_cputimers_init_work(void) { } */ struct k_itimer { struct hlist_node list; + struct hlist_node ignored_list; struct hlist_node t_hash; spinlock_t it_lock; const struct k_clock *kclock; -- cgit v1.2.3 From df7a996b4dab03c889fa86d849447b716f07b069 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:54 +0100 Subject: signal: Queue ignored posixtimers on ignore list Queue posixtimers which have their signal ignored on the ignored list: 1) When the timer fires and the signal has SIG_IGN set 2) When SIG_IGN is installed via sigaction() and a timer signal is already queued This only happens when the signal is for a valid timer, which delivered the signal in periodic mode. One-shot timer signals are correctly dropped. Due to the lock order constraints (sighand::siglock nests inside timer::lock) the signal code cannot access any of the timer fields which are relevant to make this decision, e.g. timer::it_status. This is addressed by establishing a protection scheme which requires to lock both locks on the timer side for modifying decision fields in the timer struct and therefore makes it possible for the signal delivery to evaluate with only sighand:siglock being held: 1) Move the NULLification of timer->it_signal into the sighand::siglock protected section of timer_delete() and check timer::it_signal in the code path which determines whether the signal is dropped or queued on the ignore list. This ensures that a deleted timer cannot be moved onto the ignore list, which would prevent it from being freed on exit() as it is not longer in the process' posix timer list. If the timer got moved to the ignored list before deletion then it is removed from the ignored list under sighand lock in timer_delete(). 2) Provide a new timer::it_sig_periodic flag, which gets set in the signal queue path with both timer and sighand locks held if the timer is actually in periodic mode at expiry time. The ignore list code checks this flag under sighand::siglock and drops the signal when it is not set. If it is set, then the signal is moved to the ignored list independent of the actual state of the timer. When the signal is un-ignored later then the signal is moved back to the signal queue. On signal delivery the posix timer side decides about dropping the signal if the timer was re-armed, dis-armed or deleted based on the signal sequence counter check. If the thread/process exits then not yet delivered signals are discarded which means the reference of the timer containing the sigqueue is dropped and frees the timer. This is way cheaper than requiring all code paths to lock sighand::siglock of the target thread/process on any modification of timer::it_status or going all the way and removing pending signals from the signal queues on every rearm, disarm or delete operation. So the protection scheme here is that on the timer side both timer::lock and sighand::siglock have to be held for modifying timer::it_signal timer::it_sig_periodic which means that on the signal side holding sighand::siglock is enough to evaluate these fields. In posixtimer_deliver_signal() holding timer::lock is sufficient to do the sequence validation against timer::it_signal_seq because a concurrent expiry is waiting on timer::lock to be released. This completes the SIG_IGN handling and such timers are not longer self rearmed which avoids pointless wakeups. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241105064214.120756416@linutronix.de --- include/linux/posix-timers.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 1608b52a44d5..43ea6e784a25 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -160,6 +160,7 @@ static inline void posix_cputimers_init_work(void) { } * @it_clock: The posix timer clock id * @it_id: The posix timer id for identifying the timer * @it_status: The status of the timer + * @it_sig_periodic: The periodic status at signal delivery * @it_overrun: The overrun counter for pending signals * @it_overrun_last: The overrun at the time of the last delivered signal * @it_signal_seq: Sequence count to control signal delivery @@ -184,6 +185,7 @@ struct k_itimer { clockid_t it_clock; timer_t it_id; int it_status; + bool it_sig_periodic; s64 it_overrun; s64 it_overrun_last; unsigned int it_signal_seq; -- cgit v1.2.3 From 7a66f72b09bb0762360274b1fb677b3433dbaa06 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:55 +0100 Subject: posix-timers: Cleanup SIG_IGN workaround leftovers Now that ignored posix timer signals are requeued and the timers are rearmed on signal delivery the workaround to keep such timers alive and self rearm them is not longer required. Remove the relevant hacks and the not longer required return values from the related functions. The alarm timer workarounds will be cleaned up in a separate step. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20241105064214.187239060@linutronix.de --- include/linux/posix-timers.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 43ea6e784a25..f11f10c97bd9 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -111,7 +111,7 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct, void posixtimer_rearm_itimer(struct task_struct *p); bool posixtimer_init_sigqueue(struct sigqueue *q); -int posixtimer_send_sigqueue(struct k_itimer *tmr); +void posixtimer_send_sigqueue(struct k_itimer *tmr); bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq); void posixtimer_free_timer(struct k_itimer *timer); -- cgit v1.2.3 From 2634303f8773b0c602069887565cd412440be15d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:58 +0100 Subject: alarmtimers: Remove return value from alarm functions Now that the SIG_IGN problem is solved in the core code, the alarmtimer callbacks do not require a return value anymore. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241105064214.318837272@linutronix.de --- include/linux/alarmtimer.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h index 05e758b8b894..3ffa5341dce2 100644 --- a/include/linux/alarmtimer.h +++ b/include/linux/alarmtimer.h @@ -20,12 +20,6 @@ enum alarmtimer_type { ALARM_BOOTTIME_FREEZER, }; -enum alarmtimer_restart { - ALARMTIMER_NORESTART, - ALARMTIMER_RESTART, -}; - - #define ALARMTIMER_STATE_INACTIVE 0x00 #define ALARMTIMER_STATE_ENQUEUED 0x01 @@ -42,14 +36,14 @@ enum alarmtimer_restart { struct alarm { struct timerqueue_node node; struct hrtimer timer; - enum alarmtimer_restart (*function)(struct alarm *, ktime_t now); + void (*function)(struct alarm *, ktime_t now); enum alarmtimer_type type; int state; void *data; }; void alarm_init(struct alarm *alarm, enum alarmtimer_type type, - enum alarmtimer_restart (*function)(struct alarm *, ktime_t)); + void (*function)(struct alarm *, ktime_t)); void alarm_start(struct alarm *alarm, ktime_t start); void alarm_start_relative(struct alarm *alarm, ktime_t start); void alarm_restart(struct alarm *alarm); -- cgit v1.2.3 From 1e4033b53db48cc77c436feac9c6d7d63db87b87 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Nov 2024 17:43:57 +0000 Subject: net: skb_reset_mac_len() must check if mac_header was set Recent discussions show that skb_reset_mac_len() should be more careful. We expect the MAC header being set. If not, clear skb->mac_len and fire a warning for CONFIG_DEBUG_NET=y builds. If after investigations we find that not having a MAC header was okay, we can remove the warning. Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/netdev/CANn89iJZGH+yEfJxfPWa3Hm7jxb-aeY2Up4HufmLMnVuQXt38A@mail.gmail.com/T/ Cc: En-Wei Wu Reviewed-by: Joe Damato Reviewed-by: Xuan Zhuo Link: https://patch.msgid.link/20241105174403.850330-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 48f1e0fa2a13..5d8fefa71cac 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2909,9 +2909,19 @@ static inline void skb_reset_inner_headers(struct sk_buff *skb) skb->inner_transport_header = skb->transport_header; } +static inline int skb_mac_header_was_set(const struct sk_buff *skb) +{ + return skb->mac_header != (typeof(skb->mac_header))~0U; +} + static inline void skb_reset_mac_len(struct sk_buff *skb) { - skb->mac_len = skb->network_header - skb->mac_header; + if (!skb_mac_header_was_set(skb)) { + DEBUG_NET_WARN_ON_ONCE(1); + skb->mac_len = 0; + } else { + skb->mac_len = skb->network_header - skb->mac_header; + } } static inline unsigned char *skb_inner_transport_header(const struct sk_buff @@ -3014,11 +3024,6 @@ static inline void skb_set_network_header(struct sk_buff *skb, const int offset) skb->network_header += offset; } -static inline int skb_mac_header_was_set(const struct sk_buff *skb) -{ - return skb->mac_header != (typeof(skb->mac_header))~0U; -} - static inline unsigned char *skb_mac_header(const struct sk_buff *skb) { DEBUG_NET_WARN_ON_ONCE(!skb_mac_header_was_set(skb)); -- cgit v1.2.3 From cfe8394e06f22847ecae0312bdd0b633b471b3f0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Nov 2024 17:43:58 +0000 Subject: net: add debug check in skb_reset_inner_transport_header() Make sure (skb->data - skb->head) can fit in skb->inner_transport_header This needs CONFIG_DEBUG_NET=y. Signed-off-by: Eric Dumazet Reviewed-by: Joe Damato Reviewed-by: Xuan Zhuo Link: https://patch.msgid.link/20241105174403.850330-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5d8fefa71cac..75795568803c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2937,7 +2937,10 @@ static inline int skb_inner_transport_offset(const struct sk_buff *skb) static inline void skb_reset_inner_transport_header(struct sk_buff *skb) { - skb->inner_transport_header = skb->data - skb->head; + long offset = skb->data - skb->head; + + DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->inner_transport_header))offset); + skb->inner_transport_header = offset; } static inline void skb_set_inner_transport_header(struct sk_buff *skb, -- cgit v1.2.3 From 1732e4bedb3e29986da6ea315e8bf8caf74e1e38 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Nov 2024 17:43:59 +0000 Subject: net: add debug check in skb_reset_inner_network_header() Make sure (skb->data - skb->head) can fit in skb->inner_network_header This needs CONFIG_DEBUG_NET=y. Signed-off-by: Eric Dumazet Reviewed-by: Joe Damato Reviewed-by: Xuan Zhuo Link: https://patch.msgid.link/20241105174403.850330-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 75795568803c..8dafd1295a6e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2957,7 +2957,10 @@ static inline unsigned char *skb_inner_network_header(const struct sk_buff *skb) static inline void skb_reset_inner_network_header(struct sk_buff *skb) { - skb->inner_network_header = skb->data - skb->head; + long offset = skb->data - skb->head; + + DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->inner_network_header))offset); + skb->inner_network_header = offset; } static inline void skb_set_inner_network_header(struct sk_buff *skb, -- cgit v1.2.3 From 78a0cb2f45dc9327f26956c242f7f4b450efff49 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Nov 2024 17:44:00 +0000 Subject: net: add debug check in skb_reset_inner_mac_header() Make sure (skb->data - skb->head) can fit in skb->inner_mac_header This needs CONFIG_DEBUG_NET=y. Signed-off-by: Eric Dumazet Reviewed-by: Joe Damato Reviewed-by: Xuan Zhuo Link: https://patch.msgid.link/20241105174403.850330-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8dafd1295a6e..32a7d8a65b9f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2982,7 +2982,10 @@ static inline unsigned char *skb_inner_mac_header(const struct sk_buff *skb) static inline void skb_reset_inner_mac_header(struct sk_buff *skb) { - skb->inner_mac_header = skb->data - skb->head; + long offset = skb->data - skb->head; + + DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->inner_mac_header))offset); + skb->inner_mac_header = offset; } static inline void skb_set_inner_mac_header(struct sk_buff *skb, -- cgit v1.2.3 From ae50ea52bdd77eabd8f3c4630c1b03c3373f61a5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Nov 2024 17:44:01 +0000 Subject: net: add debug check in skb_reset_transport_header() Make sure (skb->data - skb->head) can fit in skb->transport_header This needs CONFIG_DEBUG_NET=y. Signed-off-by: Eric Dumazet Reviewed-by: Joe Damato Reviewed-by: Xuan Zhuo Link: https://patch.msgid.link/20241105174403.850330-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 32a7d8a65b9f..f76524844e7e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3007,7 +3007,10 @@ static inline unsigned char *skb_transport_header(const struct sk_buff *skb) static inline void skb_reset_transport_header(struct sk_buff *skb) { - skb->transport_header = skb->data - skb->head; + long offset = skb->data - skb->head; + + DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->transport_header))offset); + skb->transport_header = offset; } static inline void skb_set_transport_header(struct sk_buff *skb, -- cgit v1.2.3 From 305ae87dafc1a9f35374a783119d9e6001d1b8e0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Nov 2024 17:44:02 +0000 Subject: net: add debug check in skb_reset_network_header() Make sure (skb->data - skb->head) can fit in skb->network_header This needs CONFIG_DEBUG_NET=y. Signed-off-by: Eric Dumazet Reviewed-by: Joe Damato Reviewed-by: Xuan Zhuo Link: https://patch.msgid.link/20241105174403.850330-7-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f76524844e7e..f1e49a32880d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3027,7 +3027,10 @@ static inline unsigned char *skb_network_header(const struct sk_buff *skb) static inline void skb_reset_network_header(struct sk_buff *skb) { - skb->network_header = skb->data - skb->head; + long offset = skb->data - skb->head; + + DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->network_header))offset); + skb->network_header = offset; } static inline void skb_set_network_header(struct sk_buff *skb, const int offset) -- cgit v1.2.3 From 3b6167e9bfc9eae4edad065c166c667f9a8e693a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Nov 2024 17:44:03 +0000 Subject: net: add debug check in skb_reset_mac_header() Make sure (skb->data - skb->head) can fit in skb->mac_header This needs CONFIG_DEBUG_NET=y. Signed-off-by: Eric Dumazet Reviewed-by: Joe Damato Reviewed-by: Xuan Zhuo Link: https://patch.msgid.link/20241105174403.850330-8-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f1e49a32880d..e5095d75abba 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3063,7 +3063,10 @@ static inline void skb_unset_mac_header(struct sk_buff *skb) static inline void skb_reset_mac_header(struct sk_buff *skb) { - skb->mac_header = skb->data - skb->head; + long offset = skb->data - skb->head; + + DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->mac_header))offset); + skb->mac_header = offset; } static inline void skb_set_mac_header(struct sk_buff *skb, const int offset) -- cgit v1.2.3 From 49a17639508c3b35f90ca829e60dddeeeb750e74 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 6 Nov 2024 15:51:39 +0100 Subject: softirq: Use a dedicated thread for timer wakeups on PREEMPT_RT. The timer and hrtimer soft interrupts are raised in hard interrupt context. With threaded interrupts force enabled or on PREEMPT_RT this leads to waking the ksoftirqd for the processing of the soft interrupt. ksoftirqd runs as SCHED_OTHER task which means it will compete with other tasks for CPU resources. This can introduce long delays for timer processing on heavy loaded systems and is not desired. Split the TIMER_SOFTIRQ and HRTIMER_SOFTIRQ processing into a dedicated timers thread and let it run at the lowest SCHED_FIFO priority. Wake-ups for RT tasks happen from hardirq context so only timer_list timers and hrtimers for "regular" tasks are processed here. The higher priority ensures that wakeups are performed before scheduling SCHED_OTHER tasks. Using a dedicated variable to store the pending softirq bits values ensure that the timer are not accidentally picked up by ksoftirqd and other threaded interrupts. It shouldn't be picked up by ksoftirqd since it runs at lower priority. However if ksoftirqd is already running while a timer fires, then ksoftird will be PI-boosted due to the BH-lock to ktimer's priority. The timer thread can pick up pending softirqs from ksoftirqd but only if the softirq load is high. It is not be desired that the picked up softirqs are processed at SCHED_FIFO priority under high softirq load but this can already happen by a PI-boost by a force-threaded interrupt. [ frederic@kernel.org: rcutorture.c fixes, storm fix by introduction of local_timers_pending() for tick_nohz_next_event() ] [ junxiao.chang@intel.com: Ensure ktimersd gets woken up even if a softirq is currently served. ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Reviewed-by: Paul E. McKenney [rcutorture] Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241106150419.2593080-4-bigeasy@linutronix.de --- include/linux/interrupt.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 457151f9f263..8cd9327e4e78 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -616,6 +616,53 @@ extern void __raise_softirq_irqoff(unsigned int nr); extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); +/* + * With forced-threaded interrupts enabled a raised softirq is deferred to + * ksoftirqd unless it can be handled within the threaded interrupt. This + * affects timer_list timers and hrtimers which are explicitly marked with + * HRTIMER_MODE_SOFT. + * With PREEMPT_RT enabled more hrtimers are moved to softirq for processing + * which includes all timers which are not explicitly marked HRTIMER_MODE_HARD. + * Userspace controlled timers (like the clock_nanosleep() interface) is divided + * into two categories: Tasks with elevated scheduling policy including + * SCHED_{FIFO|RR|DL} and the remaining scheduling policy. The tasks with the + * elevated scheduling policy are woken up directly from the HARDIRQ while all + * other wake ups are delayed to softirq and so to ksoftirqd. + * + * The ksoftirqd runs at SCHED_OTHER policy at which it should remain since it + * handles the softirq in an overloaded situation (not handled everything + * within its last run). + * If the timers are handled at SCHED_OTHER priority then they competes with all + * other SCHED_OTHER tasks for CPU resources are possibly delayed. + * Moving timers softirqs to a low priority SCHED_FIFO thread instead ensures + * that timer are performed before scheduling any SCHED_OTHER thread. + */ +DECLARE_PER_CPU(struct task_struct *, ktimerd); +DECLARE_PER_CPU(unsigned long, pending_timer_softirq); +void raise_ktimers_thread(unsigned int nr); + +static inline unsigned int local_timers_pending_force_th(void) +{ + return __this_cpu_read(pending_timer_softirq); +} + +static inline void raise_timer_softirq(unsigned int nr) +{ + lockdep_assert_in_irq(); + if (force_irqthreads()) + raise_ktimers_thread(nr); + else + __raise_softirq_irqoff(nr); +} + +static inline unsigned int local_timers_pending(void) +{ + if (force_irqthreads()) + return local_timers_pending_force_th(); + else + return local_softirq_pending(); +} + DECLARE_PER_CPU(struct task_struct *, ksoftirqd); static inline struct task_struct *this_cpu_ksoftirqd(void) -- cgit v1.2.3 From fbf920f255315974808ce91d934fe50198294d51 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 31 Oct 2024 16:14:15 +0100 Subject: hrtimers: Add missing hrtimer_init() trace points hrtimer_init*_on_stack() is not covered by tracing when CONFIG_DEBUG_OBJECTS_TIMERS=y. Rework the functions similar to hrtimer_init() and hrtimer_init_sleeper() so that the hrtimer_init() tracepoint is unconditionally available. The rework makes hrtimer_init_sleeper() unused. Delete it. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/74528e8abf2bb96e8bee85ffacbf14e15cf89f0d.1730386209.git.namcao@linutronix.de --- include/linux/hrtimer.h | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index aa1e65ccb615..5aa9d57528c4 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -228,32 +228,15 @@ static inline void hrtimer_cancel_wait_running(struct hrtimer *timer) /* Initialize timers: */ extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode); -extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, - enum hrtimer_mode mode); - -#ifdef CONFIG_DEBUG_OBJECTS_TIMERS extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode); extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode); +#ifdef CONFIG_DEBUG_OBJECTS_TIMERS extern void destroy_hrtimer_on_stack(struct hrtimer *timer); #else -static inline void hrtimer_init_on_stack(struct hrtimer *timer, - clockid_t which_clock, - enum hrtimer_mode mode) -{ - hrtimer_init(timer, which_clock, mode); -} - -static inline void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, - clockid_t clock_id, - enum hrtimer_mode mode) -{ - hrtimer_init_sleeper(sl, clock_id, mode); -} - static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { } #endif -- cgit v1.2.3 From 908a1d775422ba2e27a5e33d0c130b522419e121 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 31 Oct 2024 16:14:20 +0100 Subject: hrtimers: Introduce hrtimer_setup() to replace hrtimer_init() To initialize hrtimer, hrtimer_init() needs to be called and also hrtimer::function must be set. This is error-prone and awkward to use. Introduce hrtimer_setup() which does both of these things, so that users of hrtimer can be simplified. The new setup function also has a sanity check for the provided function pointer. If NULL, a warning is emitted and a dummy callback installed. hrtimer_init() will be removed as soon as all of its users have been converted to the new function. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/5057c1ddbfd4b92033cd93d37fe38e6b069d5ba6.1730386209.git.namcao@linutronix.de --- include/linux/hrtimer.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 5aa9d57528c4..bcc0715c59a8 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -228,6 +228,8 @@ static inline void hrtimer_cancel_wait_running(struct hrtimer *timer) /* Initialize timers: */ extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode); +extern void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t clock_id, enum hrtimer_mode mode); extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode); extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, -- cgit v1.2.3 From 444cb7db4c9f9b5d96be17c38b3e989df7bfabd5 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 31 Oct 2024 16:14:21 +0100 Subject: hrtimers: Introduce hrtimer_setup_on_stack() To initialize hrtimer on stack, hrtimer_init_on_stack() needs to be called and also hrtimer::function must be set. This is error-prone and awkward to use. Introduce hrtimer_setup_on_stack() which does both of these things, so that users of hrtimer can be simplified. The new setup function also has a sanity check for the provided function pointer. If NULL, a warning is emitted and a dummy callback installed. hrtimer_init_on_stack() will be removed as soon as all of its users have been converted to the new function. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/4b05e2ab3a82c517adf67fabc0f0cd8fe118b97c.1730386209.git.namcao@linutronix.de --- include/linux/hrtimer.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index bcc0715c59a8..2da513f8d66a 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -232,6 +232,9 @@ extern void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function clockid_t clock_id, enum hrtimer_mode mode); extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode); +extern void hrtimer_setup_on_stack(struct hrtimer *timer, + enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t clock_id, enum hrtimer_mode mode); extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode); -- cgit v1.2.3 From c9bd83abfeb9a9b103e689b251ccff7a01be8366 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 31 Oct 2024 16:14:22 +0100 Subject: hrtimers: Introduce hrtimer_setup_sleeper_on_stack() The hrtimer_init*() API is replaced by hrtimer_setup*() variants to initialize the timer including the callback function at once. hrtimer_init_sleeper_on_stack() does not need user to setup the callback function separately, so a new variant would not be strictly necessary. Nonetheless, to keep the naming convention consistent, introduce hrtimer_setup_sleeper_on_stack(). hrtimer_init_on_stack() will be removed once all users are converted. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/7b5e18e6dd0ace9eaa211201528cb9dc23752454.1730386209.git.namcao@linutronix.de --- include/linux/hrtimer.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 2da513f8d66a..48872a2b4071 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -238,6 +238,8 @@ extern void hrtimer_setup_on_stack(struct hrtimer *timer, extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode); +extern void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, + enum hrtimer_mode mode); #ifdef CONFIG_DEBUG_OBJECTS_TIMERS extern void destroy_hrtimer_on_stack(struct hrtimer *timer); -- cgit v1.2.3 From 8f02e3563bb5824eb01c94f2c75f1dcee2d05625 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 31 Oct 2024 16:14:23 +0100 Subject: hrtimers: Introduce hrtimer_update_function() Some users of hrtimer need to change the callback function after the initial setup. They write to hrtimer::function directly. That's not safe under all circumstances as the write is lockless and a concurrent timer expiry might end up using the wrong function pointer. Introduce hrtimer_update_function(), which also performs runtime checks whether it is safe to modify the callback. This allows to make hrtimer::function private once all users are converted. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20a937b0ae09ad54b5b6d86eabead7c570f1b72e.1730386209.git.namcao@linutronix.de --- include/linux/hrtimer.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 48872a2b4071..6e026730e803 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -327,6 +327,28 @@ static inline int hrtimer_callback_running(struct hrtimer *timer) return timer->base->running == timer; } +/** + * hrtimer_update_function - Update the timer's callback function + * @timer: Timer to update + * @function: New callback function + * + * Only safe to call if the timer is not enqueued. Can be called in the callback function if the + * timer is not enqueued at the same time (see the comments above HRTIMER_STATE_ENQUEUED). + */ +static inline void hrtimer_update_function(struct hrtimer *timer, + enum hrtimer_restart (*function)(struct hrtimer *)) +{ + guard(raw_spinlock_irqsave)(&timer->base->cpu_base->lock); + + if (WARN_ON_ONCE(hrtimer_is_queued(timer))) + return; + + if (WARN_ON_ONCE(!function)) + return; + + timer->function = function; +} + /* Forward a hrtimer so it expires after now: */ extern u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval); -- cgit v1.2.3 From 211647e5121e0e0da974bf69a8eb7c9fe57fa3bd Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 31 Oct 2024 16:14:28 +0100 Subject: wait: Switch to use hrtimer_setup_sleeper_on_stack() hrtimer_setup_sleeper_on_stack() replaces hrtimer_init_sleeper_on_stack() to keep the naming convention consistent. Convert the usage site over to it. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/fc91182375df81120a88dbe0263267e24d1bf19e.1730386209.git.namcao@linutronix.de --- include/linux/wait.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 8aa3372f21a0..643b7c7bf376 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -541,8 +541,8 @@ do { \ int __ret = 0; \ struct hrtimer_sleeper __t; \ \ - hrtimer_init_sleeper_on_stack(&__t, CLOCK_MONOTONIC, \ - HRTIMER_MODE_REL); \ + hrtimer_setup_sleeper_on_stack(&__t, CLOCK_MONOTONIC, \ + HRTIMER_MODE_REL); \ if ((timeout) != KTIME_MAX) { \ hrtimer_set_expires_range_ns(&__t.timer, timeout, \ current->timer_slack_ns); \ -- cgit v1.2.3 From f3bef7aaa6c807b78e8fc6929c3226d3038fe505 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 31 Oct 2024 16:14:29 +0100 Subject: hrtimers: Delete hrtimer_init_sleeper_on_stack() hrtimer_init_sleeper_on_stack() is now unused. Delete it. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/52549846635c0b3a2abf82101f539efdabcd9778.1730386209.git.namcao@linutronix.de --- include/linux/hrtimer.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 6e026730e803..4e4f04b3c0c2 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -235,9 +235,6 @@ extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock, extern void hrtimer_setup_on_stack(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode); -extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, - clockid_t clock_id, - enum hrtimer_mode mode); extern void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode); -- cgit v1.2.3 From 3c2fb0152175f9f596b40763cdc1378297da60af Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 31 Oct 2024 16:14:33 +0100 Subject: hrtimers: Delete hrtimer_init_on_stack() hrtimer_init_on_stack() is now unused. Delete it. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/510ce0d2944c4a382ea51e51d03dcfb73ba0f4f7.1730386209.git.namcao@linutronix.de --- include/linux/hrtimer.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 4e4f04b3c0c2..7ef5f7ef31a9 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -230,8 +230,6 @@ extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode); extern void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode); -extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock, - enum hrtimer_mode mode); extern void hrtimer_setup_on_stack(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode); -- cgit v1.2.3 From b33cc96c7020b923085046e5cf2e934f41c530ec Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Oct 2024 16:25:30 +0100 Subject: mm: add PageAnonNotKsm() Check that this anonymous page is really anonymous, not anonymous-or-KSM. This optimises the debug check, but its real purpose is to remove the last two users of PageKsm(). [willy@infradead.org: fix assertions] Link: https://lkml.kernel.org/r/ZwApWPER7caIA_N3@casper.infradead.org Link: https://lkml.kernel.org/r/20241002152533.1350629-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Cc: Alex Shi Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index cc839e4365c1..1fcef06a2d31 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -689,6 +689,13 @@ static __always_inline bool folio_test_anon(const struct folio *folio) return ((unsigned long)folio->mapping & PAGE_MAPPING_ANON) != 0; } +static __always_inline bool PageAnonNotKsm(const struct page *page) +{ + unsigned long flags = (unsigned long)page_folio(page)->mapping; + + return (flags & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_ANON; +} + static __always_inline bool PageAnon(const struct page *page) { return folio_test_anon(page_folio(page)); @@ -1137,14 +1144,14 @@ static __always_inline int PageAnonExclusive(const struct page *page) static __always_inline void SetPageAnonExclusive(struct page *page) { - VM_BUG_ON_PGFLAGS(!PageAnon(page) || PageKsm(page), page); + VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); } static __always_inline void ClearPageAnonExclusive(struct page *page) { - VM_BUG_ON_PGFLAGS(!PageAnon(page) || PageKsm(page), page); + VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); } -- cgit v1.2.3 From b9a256352f3ba697396c26d2a74f4081335f8cef Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Oct 2024 16:25:31 +0100 Subject: mm: remove PageKsm() All callers have been converted to use folio_test_ksm() or PageAnonNotKsm(), so we can remove this wrapper. Link: https://lkml.kernel.org/r/20241002152533.1350629-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Cc: Alex Shi Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 1fcef06a2d31..e80665bc51fa 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -725,13 +725,8 @@ static __always_inline bool folio_test_ksm(const struct folio *folio) return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_KSM; } - -static __always_inline bool PageKsm(const struct page *page) -{ - return folio_test_ksm(page_folio(page)); -} #else -TESTPAGEFLAG_FALSE(Ksm, ksm) +FOLIO_TEST_FLAG_FALSE(ksm) #endif u64 stable_page_flags(const struct page *page); -- cgit v1.2.3 From d7d65b1039019e8789119b498d97cf2531d989a8 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 3 Oct 2024 10:18:42 +0530 Subject: mm: move set_pxd_safe() helpers from generic to platform set_pxd_safe() helpers that serve a specific purpose for both x86 and riscv platforms, do not need to be in the common memory code. Otherwise they just unnecessarily make the common API more complicated. This moves the helpers from common code to platform instead. Link: https://lkml.kernel.org/r/20241003044842.246016-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Suggested-by: David Hildenbrand Acked-by: Dave Hansen Acked-by: David Hildenbrand Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Thomas Gleixner Cc: David Hildenbrand Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 38 -------------------------------------- 1 file changed, 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index e8b2ac6bd2ae..23aeffd89a4e 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1056,44 +1056,6 @@ static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) } #endif -/* - * Use set_p*_safe(), and elide TLB flushing, when confident that *no* - * TLB flush will be required as a result of the "set". For example, use - * in scenarios where it is known ahead of time that the routine is - * setting non-present entries, or re-setting an existing entry to the - * same value. Otherwise, use the typical "set" helpers and flush the - * TLB. - */ -#define set_pte_safe(ptep, pte) \ -({ \ - WARN_ON_ONCE(pte_present(*ptep) && !pte_same(*ptep, pte)); \ - set_pte(ptep, pte); \ -}) - -#define set_pmd_safe(pmdp, pmd) \ -({ \ - WARN_ON_ONCE(pmd_present(*pmdp) && !pmd_same(*pmdp, pmd)); \ - set_pmd(pmdp, pmd); \ -}) - -#define set_pud_safe(pudp, pud) \ -({ \ - WARN_ON_ONCE(pud_present(*pudp) && !pud_same(*pudp, pud)); \ - set_pud(pudp, pud); \ -}) - -#define set_p4d_safe(p4dp, p4d) \ -({ \ - WARN_ON_ONCE(p4d_present(*p4dp) && !p4d_same(*p4dp, p4d)); \ - set_p4d(p4dp, p4d); \ -}) - -#define set_pgd_safe(pgdp, pgd) \ -({ \ - WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \ - set_pgd(pgdp, pgd); \ -}) - #ifndef __HAVE_ARCH_DO_SWAP_PAGE static inline void arch_do_swap_page_nr(struct mm_struct *mm, struct vm_area_struct *vma, -- cgit v1.2.3 From 7f24cbc9c4d42db8a3c8484d120cf9c1da557fab Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Mon, 7 Oct 2024 09:50:29 +0200 Subject: mm/mmap: teach generic_get_unmapped_area{_topdown} to handle hugetlb mappings Patch series "Unify hugetlb into arch_get_unmapped_area functions", v4. This is an attempt to get rid of a fair amount of duplicated code wrt. hugetlb and *get_unmapped_area* functions. HugeTLB registers a .get_unmapped_area function which gets called from __get_unmapped_area(). hugetlb_get_unmapped_area() is defined by a bunch of architectures and it also has a generic definition for those that do not define it. Short-long story is that there is a ton of duplicated code between specific hugetlb *_get_unmapped_area_* functions and mm-core functions, so we can do better by teaching arch_get_unmapped_area* functions how to deal with hugetlb mappings. Note that not a lot of things need to be taught though. hugetlb_get_unmapped_area, that gets called for hugetlb mappings, runs some sanity checks prior to calling mm_get_unmapped_area_vmflags(), so we do not need to that down the road in the respective {generic,arch}_get_unmapped_area* functions. More information can be found in the respective patches. LTP mmapstress hugetlb selftests were ran succesfully on: This patch (of 9): We want to stop special casing hugetlb mappings and make them go through generic channels, so teach generic_get_unmapped_area{_topdown} to handle those. The main difference is that we set info.align_mask for huge mappings. Link: https://lkml.kernel.org/r/20241007075037.267650-1-osalvador@suse.de Link: https://lkml.kernel.org/r/20241007075037.267650-2-osalvador@suse.de Signed-off-by: Oscar Salvador Cc: David Hildenbrand Cc: Donet Tom Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e4697539b665..368d552e4860 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1035,9 +1035,19 @@ void hugetlb_unregister_node(struct node *node); */ bool is_raw_hwpoison_page_in_hugepage(struct page *page); +static inline unsigned long huge_page_mask_align(struct file *file) +{ + return PAGE_MASK & ~huge_page_mask(hstate_file(file)); +} + #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; +static inline unsigned long huge_page_mask_align(struct file *file) +{ + return 0; +} + static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio) { return NULL; -- cgit v1.2.3 From 7bd3f1e1a9ae7f7508c88dd056755a0a4741ae88 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Mon, 7 Oct 2024 09:50:34 +0200 Subject: mm: make hugetlb mappings go through mm_get_unmapped_area_vmflags Hugetlb mappings will no longer be special cased but rather go through the generic mm_get_unmapped_area_vmflags function. For that to happen, let us remove the .get_unmapped_area from hugetlbfs_file_operations struct, and hint __get_unmapped_area that it should not send hugetlb mappings through thp_get_unmapped_area_vmflags but through mm_get_unmapped_area_vmflags. Create also a function called hugetlb_mmap_check_and_align() where a couple of safety checks are being done and the addr is aligned to the huge page size. Otherwise we will have to do this in every single function, which duplicates quite a lot of code. Link: https://lkml.kernel.org/r/20241007075037.267650-7-osalvador@suse.de Signed-off-by: Oscar Salvador Cc: David Hildenbrand Cc: Donet Tom Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 368d552e4860..3a81b6126f62 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -546,11 +546,10 @@ static inline struct hstate *hstate_inode(struct inode *i) } #endif /* !CONFIG_HUGETLBFS */ -#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA -unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags); -#endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */ +unsigned long +__generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags); unsigned long generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, -- cgit v1.2.3 From cc92882ee218d62ef017fa545b3c8a2d1e060a5a Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Mon, 7 Oct 2024 09:50:35 +0200 Subject: mm: drop hugetlb_get_unmapped_area{_*} functions Hugetlb mappings are now handled through normal channels just like any other mapping, so we no longer need hugetlb_get_unmapped_area* specific functions. Link: https://lkml.kernel.org/r/20241007075037.267650-8-osalvador@suse.de Signed-off-by: Oscar Salvador Cc: David Hildenbrand Cc: Donet Tom Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3a81b6126f62..ae4fe8615bb6 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -547,15 +547,10 @@ static inline struct hstate *hstate_inode(struct inode *i) #endif /* !CONFIG_HUGETLBFS */ unsigned long -__generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, +hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); -unsigned long -generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags); - /* * huegtlb page specific state flags. These flags are located in page.private * of the hugetlb head page. Functions created via the below macros should be -- cgit v1.2.3 From afe789b7367ad43ba8f079981d40851f8bd319ce Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Tue, 8 Oct 2024 19:50:24 -0700 Subject: kaslr: rename physmem_end and PHYSMEM_END to direct_map_physmem_end For clarity. It's increasingly hard to reason about the code, when KASLR is moving around the boundaries. In this case where KASLR is randomizing the location of the kernel image within physical memory, the maximum number of address bits for physical memory has not changed. What has changed is the ending address of memory that is allowed to be directly mapped by the kernel. Let's name the variable, and the associated macro accordingly. Also, enhance the comment above the direct_map_physmem_end definition, to further clarify how this all works. Link: https://lkml.kernel.org/r/20241009025024.89813-1-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Pankaj Gupta Acked-by: David Hildenbrand Acked-by: Will Deacon Reviewed-by: Mike Rapoport (Microsoft) Cc: Thomas Gleixner Cc: Alistair Popple Cc: Jordan Niethe Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8f5394d75ce2..4570f33e2429 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -97,11 +97,11 @@ extern const int mmap_rnd_compat_bits_max; extern int mmap_rnd_compat_bits __read_mostly; #endif -#ifndef PHYSMEM_END +#ifndef DIRECT_MAP_PHYSMEM_END # ifdef MAX_PHYSMEM_BITS -# define PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1) +# define DIRECT_MAP_PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1) # else -# define PHYSMEM_END (((phys_addr_t)-1)&~(1ULL<<63)) +# define DIRECT_MAP_PHYSMEM_END (((phys_addr_t)-1)&~(1ULL<<63)) # endif #endif -- cgit v1.2.3 From 6359c39c9de66dede8ff5ff257c9e117483dbc7c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 10 Oct 2024 14:15:56 +0800 Subject: mm: remove unused hugepage for vma_alloc_folio() The hugepage parameter was deprecated since commit ddc1a5cbc05d ("mempolicy: alloc_pages_mpol() for NUMA policy without vma"), for PMD-sized THP, it still tries only preferred node if possible in vma_alloc_folio() by checking the order of the folio allocation. Link: https://lkml.kernel.org/r/20241010061556.1846751-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Barry Song Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/gfp.h | 6 +++--- include/linux/highmem.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index a951de920e20..b65724c3427d 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -306,7 +306,7 @@ struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order); struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid); struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, bool hugepage); + unsigned long addr); #else static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order) { @@ -326,7 +326,7 @@ static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int orde { return folio_alloc_noprof(gfp, order); } -#define vma_alloc_folio_noprof(gfp, order, vma, addr, hugepage) \ +#define vma_alloc_folio_noprof(gfp, order, vma, addr) \ folio_alloc_noprof(gfp, order) #endif @@ -341,7 +341,7 @@ static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int orde static inline struct page *alloc_page_vma_noprof(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { - struct folio *folio = vma_alloc_folio_noprof(gfp, 0, vma, addr, false); + struct folio *folio = vma_alloc_folio_noprof(gfp, 0, vma, addr); return &folio->page; } diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 930a591b9b61..bec9bd715acf 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -226,7 +226,7 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, { struct folio *folio; - folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr, false); + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr); if (folio) clear_user_highpage(&folio->page, vaddr); -- cgit v1.2.3 From 5708d96da20b99b4665ad72395e3727016057f70 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 11 Oct 2024 11:03:04 -0400 Subject: mm: avoid zeroing user movable page twice with init_on_alloc=1 Commit 6471384af2a6 ("mm: security: introduce init_on_alloc=1 and init_on_free=1 boot options") forces allocated page to be zeroed in post_alloc_hook() when init_on_alloc=1. For order-0 folios, if arch does not define vma_alloc_zeroed_movable_folio(), the default implementation again zeros the page return from the buddy allocator. So the page is zeroed twice. Fix it by passing __GFP_ZERO instead to avoid double page zeroing. At the moment, s390,arm64,x86,alpha,m68k are not impacted since they define their own vma_alloc_zeroed_movable_folio(). For >0 order folios (mTHP and PMD THP), folio_zero_user() is called to zero the folio again. Fix it by calling folio_zero_user() only if init_on_alloc is set. All arch are impacted. Add alloc_zeroed() helper to encapsulate the init_on_alloc check. [ziy@nvidia.com: comment fixes, per David] Link: https://lkml.kernel.org/r/97DB52E1-C594-49B5-9736-89AC302FAB01@nvidia.com Link: https://lkml.kernel.org/r/20241011150304.709590-1-ziy@nvidia.com Signed-off-by: Zi Yan Acked-by: Vlastimil Babka Acked-by: David Hildenbrand Cc: Alexander Potapenko Cc: "Huang, Ying" Cc: John Hubbard Cc: Kees Cook Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/highmem.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index bec9bd715acf..6e452bd8e7e3 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -224,13 +224,7 @@ static inline struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr) { - struct folio *folio; - - folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr); - if (folio) - clear_user_highpage(&folio->page, vaddr); - - return folio; + return vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr); } #endif -- cgit v1.2.3 From f1001f3d3b6868998cab73d10fda1a5c99ddf963 Mon Sep 17 00:00:00 2001 From: Wei Xu Date: Thu, 17 Oct 2024 18:15:28 +0000 Subject: mm/mglru: reset page lru tier bits when activating When a folio is activated, lru_gen_add_folio() moves the folio to the youngest generation. But unlike folio_update_gen()/folio_inc_gen(), lru_gen_add_folio() doesn't reset the folio lru tier bits (LRU_REFS_MASK | LRU_REFS_FLAGS). This inconsistency can affect how pages are aged via folio_mark_accessed() (e.g. fd accesses), though no user visible impact related to this has been detected yet. Note that lru_gen_add_folio() cannot clear PG_workingset if the activation is due to workingset refault, otherwise PSI accounting will be skipped. So fix lru_gen_add_folio() to clear the lru tier bits other than PG_workingset when activating a folio, and also clear all the lru tier bits when a folio is activated via folio_activate() in lru_gen_look_around(). Link: https://lkml.kernel.org/r/20241017181528.3358821-1-weixugc@google.com Fixes: 018ee47f1489 ("mm: multi-gen LRU: exploit locality in rmap") Signed-off-by: Wei Xu Cc: Axel Rasmussen Cc: Brian Geffon Cc: Jan Alexander Steffens Cc: Suleiman Souhlal Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 15 ++++++++++++++- include/linux/mmzone.h | 2 ++ 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 6f801c7b36e2..355cf46a01a6 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -155,6 +155,11 @@ static inline int folio_lru_refs(struct folio *folio) return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset; } +static inline void folio_clear_lru_refs(struct folio *folio) +{ + set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); +} + static inline int folio_lru_gen(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); @@ -222,6 +227,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, { unsigned long seq; unsigned long flags; + unsigned long mask; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); @@ -257,7 +263,14 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, gen = lru_gen_from_seq(seq); flags = (gen + 1UL) << LRU_GEN_PGOFF; /* see the comment on MIN_NR_GENS about PG_active */ - set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags); + mask = LRU_GEN_MASK; + /* + * Don't clear PG_workingset here because it can affect PSI accounting + * if the activation is due to workingset refault. + */ + if (folio_test_active(folio)) + mask |= LRU_REFS_MASK | BIT(PG_referenced) | BIT(PG_active); + set_mask_bits(&folio->flags, mask, flags); lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5b1c984daf45..2e8c4307c728 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -403,6 +403,8 @@ enum { NR_LRU_GEN_CAPS }; +#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) + #define MIN_LRU_BATCH BITS_PER_LONG #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) -- cgit v1.2.3 From 9884efd795cc2f71ef3b7f42df32420b0b7ce34f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 17 Oct 2024 22:14:56 +0800 Subject: mm: huge_memory: move file_thp_enabled() into huge_memory.c file_thp_enabled() is only used in __thp_vma_allowable_orders(), so move it into huge_memory.c, also check READ_ONLY_THP_FOR_FS ahead to avoid unnecessary code if config disabled. Link: https://lkml.kernel.org/r/20241017141457.1169092-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: David Hildenbrand Reviewed-by: Baolin Wang Cc: Barry Song Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 8afe09a2cf03..006f730545c2 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -252,19 +252,6 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, return orders; } -static inline bool file_thp_enabled(struct vm_area_struct *vma) -{ - struct inode *inode; - - if (!vma->vm_file) - return false; - - inode = vma->vm_file->f_inode; - - return (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS)) && - !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); -} - unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, unsigned long vm_flags, unsigned long tva_flags, -- cgit v1.2.3 From b7f058f827392022d8c689329f88c7b324d71dad Mon Sep 17 00:00:00 2001 From: Luoxi Li Date: Fri, 18 Oct 2024 17:22:35 +0800 Subject: mm: remove unused has_isolate_pageblock has_isolate_pageblock() has been unused since commit 55612e80e722 ("mm: page_alloc: close migratetype race between freeing and stealing") Remove it. Link: https://lkml.kernel.org/r/20241018092235.2764859-1-kaixa@kiloview.com Signed-off-by: Luoxi Li Acked-by: David Hildenbrand Reviewed-by: Muhammad Usama Anjum Acked-by: Johannes Weiner Reviewed-by: Anshuman Khandual Cc: Baolin Wang Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/page-isolation.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index c16db0067090..73dc2c1841ec 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -3,10 +3,6 @@ #define __LINUX_PAGEISOLATION_H #ifdef CONFIG_MEMORY_ISOLATION -static inline bool has_isolate_pageblock(struct zone *zone) -{ - return zone->nr_isolate_pageblock; -} static inline bool is_migrate_isolate_page(struct page *page) { return get_pageblock_migratetype(page) == MIGRATE_ISOLATE; @@ -16,10 +12,6 @@ static inline bool is_migrate_isolate(int migratetype) return migratetype == MIGRATE_ISOLATE; } #else -static inline bool has_isolate_pageblock(struct zone *zone) -{ - return false; -} static inline bool is_migrate_isolate_page(struct page *page) { return false; -- cgit v1.2.3 From 78c018e3942c5dfbab7e6edb4eb784943878504b Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 7 Oct 2024 23:47:45 +0200 Subject: maple_tree: fix outdated flag name in comment MAPLE_USE_RCU was renamed to MT_FLAGS_USE_RCU at some point, fix up the comment. Link: https://lkml.kernel.org/r/20241007-maple-tree-doc-fix-v1-1-6bbf89c1153d@google.com Signed-off-by: Jann Horn Reviewed-by: Liam R. Howlett Reviewed-by: Wei Yang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index c2c11004085e..61c236850ca8 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -224,7 +224,7 @@ typedef struct { /* nothing */ } lockdep_map_p; * (set at tree creation time) and dynamic information set under the spinlock. * * Another use of flags are to indicate global states of the tree. This is the - * case with the MAPLE_USE_RCU flag, which indicates the tree is currently in + * case with the MT_FLAGS_USE_RCU flag, which indicates the tree is currently in * RCU mode. This mode was added to allow the tree to reuse nodes instead of * re-allocating and RCU freeing nodes when there is a single user. */ -- cgit v1.2.3 From ed265529d39ac408396c031a4fd7e1ef922b80d0 Mon Sep 17 00:00:00 2001 From: Sourav Panda Date: Tue, 22 Oct 2024 23:24:40 +0000 Subject: mm/codetag: fix arg in pgalloc_tag_copy alloc_tag_sub alloc_tag_sub() takes bytes as opposed to number of pages as argument. Currently pgalloc_tag_copy() passes the number of pages. This fix passes the correct unit, which is the number of bytes allocated. Link: https://lkml.kernel.org/r/20241022232440.334820-1-souravpanda@google.com Fixes: e0a955bf7f61 ("mm/codetag: add pgalloc_tag_copy()") Signed-off-by: Sourav Panda Acked-by: Suren Baghdasaryan Reviewed-by: Pasha Tatashin Reviewed-by: Anshuman Khandual Cc: Wei Xu Cc: Yu Zhao Cc: Kent Overstreet Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4570f33e2429..eb070c14e309 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4207,7 +4207,7 @@ static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) /* Clear the old ref to the original allocation tag. */ clear_page_tag_ref(&old->page); /* Decrement the counters of the tag on get_new_folio. */ - alloc_tag_sub(ref, folio_nr_pages(new)); + alloc_tag_sub(ref, folio_size(new)); __alloc_tag_ref_set(ref, tag); -- cgit v1.2.3 From 628e1b8c4777941e119effc92cd395b4b02c2c5f Mon Sep 17 00:00:00 2001 From: James Houghton Date: Mon, 21 Oct 2024 16:02:12 +0000 Subject: mm: add missing mmu_notifier_clear_young for !MMU_NOTIFIER Remove the now unnecessary ifdef in mm/damon/vaddr.c as well. Link: https://lkml.kernel.org/r/20241021160212.9935-1-jthoughton@google.com Signed-off-by: James Houghton Reviewed-by: Jason Gunthorpe Acked-by: David Hildenbrand Reviewed-by: SeongJae Park Reviewed-by: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index d39ebb10caeb..e2dd57ca368b 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -606,6 +606,13 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, return 0; } +static inline int mmu_notifier_clear_young(struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + return 0; +} + static inline int mmu_notifier_test_young(struct mm_struct *mm, unsigned long address) { -- cgit v1.2.3 From 6b611388b626eaa59d202bf8f64d095ff80bcde6 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 24 Oct 2024 18:22:59 -0700 Subject: memcg-v1: remove charge move code The memcg-v1 charge move feature has been deprecated completely and let's remove the relevant code as well. Link: https://lkml.kernel.org/r/20241025012304.2473312-3-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Hugh Dickins Cc: Muchun Song Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 34d2da05f2f1..0b113267b2de 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -299,11 +299,6 @@ struct mem_cgroup { /* For oom notifier event fd */ struct list_head oom_notify; - /* - * Should we move charges of a task when a task is moved into this - * mem_cgroup ? And what type of charges should we move ? - */ - unsigned long move_charge_at_immigrate; /* taken only while moving_account > 0 */ spinlock_t move_lock; unsigned long move_lock_flags; -- cgit v1.2.3 From a29c0e4b2e867f4e362a6740c430bfdc2efdd1d9 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 24 Oct 2024 18:23:03 -0700 Subject: memcg-v1: remove memcg move locking code The memcg v1's charge move feature has been deprecated. All the places using the memcg move lock, have stopped using it as they don't need the protection any more. Let's proceed to remove all the locking code related to charge moving. Link: https://lkml.kernel.org/r/20241025012304.2473312-7-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Hugh Dickins Cc: Muchun Song Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 54 ---------------------------------------------- 1 file changed, 54 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0b113267b2de..bb49e0d4b377 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -299,20 +299,10 @@ struct mem_cgroup { /* For oom notifier event fd */ struct list_head oom_notify; - /* taken only while moving_account > 0 */ - spinlock_t move_lock; - unsigned long move_lock_flags; - /* Legacy tcp memory accounting */ bool tcpmem_active; int tcpmem_pressure; - /* - * set > 0 if pages under this cgroup are moving to other cgroup. - */ - atomic_t moving_account; - struct task_struct *move_lock_task; - /* List of events which userspace want to receive */ struct list_head event_list; spinlock_t event_list_lock; @@ -428,9 +418,7 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio) * * - the folio lock * - LRU isolation - * - folio_memcg_lock() * - exclusive reference - * - mem_cgroup_trylock_pages() * * For a kmem folio a caller should hold an rcu read lock to protect memcg * associated with a kmem folio from being released. @@ -499,9 +487,7 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) * * - the folio lock * - LRU isolation - * - lock_folio_memcg() * - exclusive reference - * - mem_cgroup_trylock_pages() * * For a kmem folio a caller should hold an rcu read lock to protect memcg * associated with a kmem folio from being released. @@ -1867,26 +1853,6 @@ static inline bool task_in_memcg_oom(struct task_struct *p) return p->memcg_in_oom; } -void folio_memcg_lock(struct folio *folio); -void folio_memcg_unlock(struct folio *folio); - -/* try to stablize folio_memcg() for all the pages in a memcg */ -static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) -{ - rcu_read_lock(); - - if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account)) - return true; - - rcu_read_unlock(); - return false; -} - -static inline void mem_cgroup_unlock_pages(void) -{ - rcu_read_unlock(); -} - static inline void mem_cgroup_enter_user_fault(void) { WARN_ON(current->in_user_fault); @@ -1908,26 +1874,6 @@ unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, return 0; } -static inline void folio_memcg_lock(struct folio *folio) -{ -} - -static inline void folio_memcg_unlock(struct folio *folio) -{ -} - -static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) -{ - /* to match folio_memcg_rcu() */ - rcu_read_lock(); - return true; -} - -static inline void mem_cgroup_unlock_pages(void) -{ - rcu_read_unlock(); -} - static inline bool task_in_memcg_oom(struct task_struct *p) { return false; -- cgit v1.2.3 From 906c38ff52e95575ddf3281bee531eded3dba150 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Sat, 26 Oct 2024 09:37:07 -0700 Subject: memcg: workingset: remove folio_memcg_rcu usage The function workingset_activation() is called from folio_mark_accessed() with the guarantee that the given folio can not be freed under us in workingset_activation(). In addition, the association of the folio and its memcg can not be broken here because charge migration is no more. There is no need to use folio_memcg_rcu. Simply use folio_memcg_charged() because that is what this function cares about. [akpm@linux-foundation.org: provide folio_memcg_charged stub for CONFIG_MEMCG=n] Link: https://lkml.kernel.org/r/20241026163707.2479526-1-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Suggested-by: Yu Zhao Cc: Michal Hocko Cc: Roman Gushchin Cc: Johannes Weiner Cc: Muchun Song Cc: Hugh Dickins Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 34 ++-------------------------------- 1 file changed, 2 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index bb49e0d4b377..8e4608be811d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -443,35 +443,6 @@ static inline bool folio_memcg_charged(struct folio *folio) return __folio_memcg(folio) != NULL; } -/** - * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio. - * @folio: Pointer to the folio. - * - * This function assumes that the folio is known to have a - * proper memory cgroup pointer. It's not safe to call this function - * against some type of folios, e.g. slab folios or ex-slab folios. - * - * Return: A pointer to the memory cgroup associated with the folio, - * or NULL. - */ -static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) -{ - unsigned long memcg_data = READ_ONCE(folio->memcg_data); - - VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); - - if (memcg_data & MEMCG_DATA_KMEM) { - struct obj_cgroup *objcg; - - objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK); - return obj_cgroup_memcg(objcg); - } - - WARN_ON_ONCE(!rcu_read_lock_held()); - - return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); -} - /* * folio_memcg_check - Get the memory cgroup associated with a folio. * @folio: Pointer to the folio. @@ -1084,10 +1055,9 @@ static inline struct mem_cgroup *folio_memcg(struct folio *folio) return NULL; } -static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) +static inline bool folio_memcg_charged(struct folio *folio) { - WARN_ON_ONCE(!rcu_read_lock_held()); - return NULL; + return false; } static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) -- cgit v1.2.3 From 495e7c8e9601245738a6ab53c992cc5aa0b13cb4 Mon Sep 17 00:00:00 2001 From: Jinjian Song Date: Mon, 4 Nov 2024 17:44:34 +0800 Subject: wwan: core: Add WWAN ADB and MIPC port type Add new WWAN ports that connect to the device's ADB protocol interface and MTK MIPC diagnostic interface. Signed-off-by: Jinjian Song Reviewed-by: Sergey Ryazanov Signed-off-by: Paolo Abeni --- include/linux/wwan.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wwan.h b/include/linux/wwan.h index 170fdee6339c..79c781875c09 100644 --- a/include/linux/wwan.h +++ b/include/linux/wwan.h @@ -17,6 +17,8 @@ * @WWAN_PORT_FIREHOSE: XML based command protocol * @WWAN_PORT_XMMRPC: Control protocol for Intel XMM modems * @WWAN_PORT_FASTBOOT: Fastboot protocol control + * @WWAN_PORT_ADB: ADB protocol control + * @WWAN_PORT_MIPC: MTK MIPC diagnostic interface * * @WWAN_PORT_MAX: Highest supported port types * @WWAN_PORT_UNKNOWN: Special value to indicate an unknown port type @@ -30,6 +32,8 @@ enum wwan_port_type { WWAN_PORT_FIREHOSE, WWAN_PORT_XMMRPC, WWAN_PORT_FASTBOOT, + WWAN_PORT_ADB, + WWAN_PORT_MIPC, /* Add new port types above this line */ -- cgit v1.2.3 From ab9bc81c1cf0efc7fc5a3aa4e562aa88d09ada57 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Nov 2024 05:45:34 -0700 Subject: Revert "block: pre-calculate max_zone_append_sectors" This causes issue on, at least, nvme-mpath where my boot fails with: WARNING: CPU: 354 PID: 2729 at block/blk-settings.c:75 blk_validate_limits+0x356/0x380 Modules linked in: tg3(+) nvme usbcore scsi_mod ptp i2c_piix4 libphy nvme_core crc32c_intel scsi_common usb_common pps_core i2c_smbus CPU: 354 UID: 0 PID: 2729 Comm: kworker/u2061:1 Not tainted 6.12.0-rc6+ #181 Hardware name: Dell Inc. PowerEdge R7625/06444F, BIOS 1.8.3 04/02/2024 Workqueue: async async_run_entry_fn RIP: 0010:blk_validate_limits+0x356/0x380 Code: f6 47 01 04 75 28 83 bf 94 00 00 00 00 75 39 83 bf 98 00 00 00 00 75 34 83 7f 68 00 75 32 31 c0 83 7f 5c 00 0f 84 9b fd ff ff <0f> 0b eb 13 0f 0b eb 0f 48 c7 c0 74 12 58 92 48 89 c7 e8 13 76 46 RSP: 0018:ffffa8a1dfb93b30 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ffff9232829c8388 RCX: 0000000000000088 RDX: 0000000000000080 RSI: 0000000000000200 RDI: ffffa8a1dfb93c38 RBP: 000000000000000c R08: 00000000ffffffff R09: 000000000000ffff R10: 0000000000000000 R11: 0000000000000000 R12: ffff9232829b9000 R13: ffff9232829b9010 R14: ffffa8a1dfb93c38 R15: ffffa8a1dfb93c38 FS: 0000000000000000(0000) GS:ffff923867c80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055c1b92480a8 CR3: 0000002484ff0002 CR4: 0000000000370ef0 Call Trace: ? __warn+0xca/0x1a0 ? blk_validate_limits+0x356/0x380 ? report_bug+0x11a/0x1a0 ? handle_bug+0x5e/0x90 ? exc_invalid_op+0x16/0x40 ? asm_exc_invalid_op+0x16/0x20 ? blk_validate_limits+0x356/0x380 blk_alloc_queue+0x7a/0x250 __blk_alloc_disk+0x39/0x80 nvme_mpath_alloc_disk+0x13d/0x1b0 [nvme_core] nvme_scan_ns+0xcc7/0x1010 [nvme_core] async_run_entry_fn+0x27/0x120 process_scheduled_works+0x1a0/0x360 worker_thread+0x2bc/0x350 ? pr_cont_work+0x1b0/0x1b0 kthread+0x111/0x120 ? kthread_unuse_mm+0x90/0x90 ret_from_fork+0x30/0x40 ? kthread_unuse_mm+0x90/0x90 ret_from_fork_asm+0x11/0x20 ---[ end trace 0000000000000000 ]--- presumably due to max_zone_append_sectors not being cleared to zero, resulting in blk_validate_zoned_limits() complaining and failing. This reverts commit 2a8f6153e1c2db06a537a5c9d61102eb591776f1. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6d1413bd69a5..7bfc877e159e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -375,7 +375,6 @@ struct queue_limits { unsigned int max_user_discard_sectors; unsigned int max_secure_erase_sectors; unsigned int max_write_zeroes_sectors; - unsigned int max_hw_zone_append_sectors; unsigned int max_zone_append_sectors; unsigned int discard_granularity; unsigned int discard_alignment; @@ -1205,9 +1204,25 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q) return q->limits.max_segment_size; } +static inline unsigned int +queue_limits_max_zone_append_sectors(const struct queue_limits *l) +{ + unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors); + + return min_not_zero(l->max_zone_append_sectors, max_sectors); +} + +static inline unsigned int queue_max_zone_append_sectors(struct request_queue *q) +{ + if (!blk_queue_is_zoned(q)) + return 0; + + return queue_limits_max_zone_append_sectors(&q->limits); +} + static inline bool queue_emulates_zone_append(struct request_queue *q) { - return blk_queue_is_zoned(q) && !q->limits.max_hw_zone_append_sectors; + return blk_queue_is_zoned(q) && !q->limits.max_zone_append_sectors; } static inline bool bdev_emulates_zone_append(struct block_device *bdev) @@ -1218,7 +1233,7 @@ static inline bool bdev_emulates_zone_append(struct block_device *bdev) static inline unsigned int bdev_max_zone_append_sectors(struct block_device *bdev) { - return bdev_limits(bdev)->max_zone_append_sectors; + return queue_max_zone_append_sectors(bdev_get_queue(bdev)); } static inline unsigned int bdev_max_segments(struct block_device *bdev) -- cgit v1.2.3 From 1900e1a4495b7376cb9b4e58f1d846660f4c9c4b Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 6 Nov 2024 15:34:45 +0800 Subject: nvme: add reservation command's defines This is a preparation patch for NVMeOF target reservation commands implantation. Add the defines of reservation command, such as reservation log and sub operations. Signed-off-by: Guixin Liu Tested-by: Chaitanya Kulkarni Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index b58d9405d65e..44d048d68503 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -2037,4 +2037,72 @@ struct nvme_completion { #define NVME_MINOR(ver) (((ver) >> 8) & 0xff) #define NVME_TERTIARY(ver) ((ver) & 0xff) +enum { + NVME_AEN_RESV_LOG_PAGE_AVALIABLE = 0x00, +}; + +enum { + NVME_PR_LOG_EMPTY_LOG_PAGE = 0x00, + NVME_PR_LOG_REGISTRATION_PREEMPTED = 0x01, + NVME_PR_LOG_RESERVATION_RELEASED = 0x02, + NVME_PR_LOG_RESERVATOPM_PREEMPTED = 0x03, +}; + +enum { + NVME_PR_NOTIFY_BIT_REG_PREEMPTED = 1, + NVME_PR_NOTIFY_BIT_RESV_RELEASED = 2, + NVME_PR_NOTIFY_BIT_RESV_PREEMPTED = 3, +}; + +struct nvme_pr_log { + __le64 count; + __u8 type; + __u8 nr_pages; + __u8 rsvd1[2]; + __le32 nsid; + __u8 rsvd2[48]; +}; + +struct nvmet_pr_register_data { + __le64 crkey; + __le64 nrkey; +}; + +struct nvmet_pr_acquire_data { + __le64 crkey; + __le64 prkey; +}; + +struct nvmet_pr_release_data { + __le64 crkey; +}; + +enum nvme_pr_capabilities { + NVME_PR_SUPPORT_PTPL = 1, + NVME_PR_SUPPORT_WRITE_EXCLUSIVE = 1 << 1, + NVME_PR_SUPPORT_EXCLUSIVE_ACCESS = 1 << 2, + NVME_PR_SUPPORT_WRITE_EXCLUSIVE_REG_ONLY = 1 << 3, + NVME_PR_SUPPORT_EXCLUSIVE_ACCESS_REG_ONLY = 1 << 4, + NVME_PR_SUPPORT_WRITE_EXCLUSIVE_ALL_REGS = 1 << 5, + NVME_PR_SUPPORT_EXCLUSIVE_ACCESS_ALL_REGS = 1 << 6, + NVME_PR_SUPPORT_IEKEY_VER_1_3_DEF = 1 << 7, +}; + +enum nvme_pr_register_action { + NVME_PR_REGISTER_ACT_REG = 0, + NVME_PR_REGISTER_ACT_UNREG = 1, + NVME_PR_REGISTER_ACT_REPLACE = 1 << 1, +}; + +enum nvme_pr_acquire_action { + NVME_PR_ACQUIRE_ACT_ACQUIRE = 0, + NVME_PR_ACQUIRE_ACT_PREEMPT = 1, + NVME_PR_ACQUIRE_ACT_PREEMPT_AND_ABORT = 1 << 1, +}; + +enum nvme_pr_release_action { + NVME_PR_RELEASE_ACT_RELEASE = 0, + NVME_PR_RELEASE_ACT_CLEAR = 1, +}; + #endif /* _LINUX_NVME_H */ -- cgit v1.2.3 From beeb9220c7307fbb61a2cd6575907db52bde722f Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 23 Oct 2024 19:27:04 +0300 Subject: mm: vmalloc: group declarations depending on CONFIG_MMU together Patch series "x86/module: use large ROX pages for text allocations", v7. These patches add support for using large ROX pages for allocations of executable memory on x86. They address Andy's comments [1] about having executable mappings for code that was not completely formed. The approach taken is to allocate ROX memory along with writable but not executable memory and use the writable copy to perform relocations and alternatives patching. After the module text gets into its final shape, the contents of the writable memory is copied into the actual ROX location using text poking. The allocations of the ROX memory use vmalloc(VMAP_ALLOW_HUGE_MAP) to allocate PMD aligned memory, fill that memory with invalid instructions and in the end remap it as ROX. Portions of these large pages are handed out to execmem_alloc() callers without any changes to the permissions. When the memory is freed with execmem_free() it is invalidated again so that it won't contain stale instructions. The module memory allocation, x86 code dealing with relocations and alternatives patching take into account the existence of the two copies, the writable memory and the ROX memory at the actual allocated virtual address. [1] https://lore.kernel.org/all/a17c65c6-863f-4026-9c6f-a04b659e9ab4@app.fastmail.com This patch (of 8): There are a couple of declarations that depend on CONFIG_MMU in include/linux/vmalloc.h spread all over the file. Group them all together to improve code readability. No functional changes. Link: https://lkml.kernel.org/r/20241023162711.2579610-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20241023162711.2579610-2-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Luis Chamberlain Tested-by: kdevops Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Oleg Nesterov Cc: Palmer Dabbelt Cc: Peter Zijlstra Cc: Richard Weinberger Cc: Russell King Cc: Song Liu Cc: Stafford Horne Cc: Steven Rostedt (Google) Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/vmalloc.h | 60 ++++++++++++++++++++----------------------------- 1 file changed, 24 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index ad2ce7a6ab7a..27408f21e501 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -134,12 +134,6 @@ extern void vm_unmap_ram(const void *mem, unsigned int count); extern void *vm_map_ram(struct page **pages, unsigned int count, int node); extern void vm_unmap_aliases(void); -#ifdef CONFIG_MMU -extern unsigned long vmalloc_nr_pages(void); -#else -static inline unsigned long vmalloc_nr_pages(void) { return 0; } -#endif - extern void *vmalloc_noprof(unsigned long size) __alloc_size(1); #define vmalloc(...) alloc_hooks(vmalloc_noprof(__VA_ARGS__)) @@ -266,12 +260,29 @@ static inline bool is_vm_area_hugepages(const void *addr) #endif } +/* for /proc/kcore */ +long vread_iter(struct iov_iter *iter, const char *addr, size_t count); + +/* + * Internals. Don't use.. + */ +__init void vm_area_add_early(struct vm_struct *vm); +__init void vm_area_register_early(struct vm_struct *vm, size_t align); + +int register_vmap_purge_notifier(struct notifier_block *nb); +int unregister_vmap_purge_notifier(struct notifier_block *nb); + #ifdef CONFIG_MMU +#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) + +unsigned long vmalloc_nr_pages(void); + int vm_area_map_pages(struct vm_struct *area, unsigned long start, unsigned long end, struct page **pages); void vm_area_unmap_pages(struct vm_struct *area, unsigned long start, unsigned long end); void vunmap_range(unsigned long addr, unsigned long end); + static inline void set_vm_flush_reset_perms(void *addr) { struct vm_struct *vm = find_vm_area(addr); @@ -279,24 +290,14 @@ static inline void set_vm_flush_reset_perms(void *addr) if (vm) vm->flags |= VM_FLUSH_RESET_PERMS; } +#else /* !CONFIG_MMU */ +#define VMALLOC_TOTAL 0UL -#else -static inline void set_vm_flush_reset_perms(void *addr) -{ -} -#endif - -/* for /proc/kcore */ -extern long vread_iter(struct iov_iter *iter, const char *addr, size_t count); - -/* - * Internals. Don't use.. - */ -extern __init void vm_area_add_early(struct vm_struct *vm); -extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); +static inline unsigned long vmalloc_nr_pages(void) { return 0; } +static inline void set_vm_flush_reset_perms(void *addr) {} +#endif /* CONFIG_MMU */ -#ifdef CONFIG_SMP -# ifdef CONFIG_MMU +#if defined(CONFIG_MMU) && defined(CONFIG_SMP) struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, size_t align); @@ -311,22 +312,9 @@ pcpu_get_vm_areas(const unsigned long *offsets, return NULL; } -static inline void -pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) -{ -} -# endif -#endif - -#ifdef CONFIG_MMU -#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) -#else -#define VMALLOC_TOTAL 0UL +static inline void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) {} #endif -int register_vmap_purge_notifier(struct notifier_block *nb); -int unregister_vmap_purge_notifier(struct notifier_block *nb); - #if defined(CONFIG_MMU) && defined(CONFIG_PRINTK) bool vmalloc_dump_obj(void *object); #else -- cgit v1.2.3 From 0c3beacf681ec897e0b36685a9b49d01f5cb2dfb Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 23 Oct 2024 19:27:06 +0300 Subject: asm-generic: introduce text-patching.h Several architectures support text patching, but they name the header files that declare patching functions differently. Make all such headers consistently named text-patching.h and add an empty header in asm-generic for architectures that do not support text patching. Link: https://lkml.kernel.org/r/20241023162711.2579610-4-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Christoph Hellwig Acked-by: Geert Uytterhoeven # m68k Acked-by: Arnd Bergmann Reviewed-by: Luis Chamberlain Tested-by: kdevops Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Borislav Petkov (AMD) Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: Dinh Nguyen Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Oleg Nesterov Cc: Palmer Dabbelt Cc: Peter Zijlstra Cc: Richard Weinberger Cc: Russell King Cc: Song Liu Cc: Stafford Horne Cc: Steven Rostedt (Google) Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Uladzislau Rezki (Sony) Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/text-patching.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 include/linux/text-patching.h (limited to 'include/linux') diff --git a/include/linux/text-patching.h b/include/linux/text-patching.h new file mode 100644 index 000000000000..ad5877ab0855 --- /dev/null +++ b/include/linux/text-patching.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_TEXT_PATCHING_H +#define _LINUX_TEXT_PATCHING_H + +#include + +#ifndef text_poke_copy +static inline void *text_poke_copy(void *dst, const void *src, size_t len) +{ + return memcpy(dst, src, len); +} +#define text_poke_copy text_poke_copy +#endif + +#endif /* _LINUX_TEXT_PATCHING_H */ -- cgit v1.2.3 From 0c133b1e78cd34dd9d18da707dc6f46170e9129e Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 23 Oct 2024 19:27:07 +0300 Subject: module: prepare to handle ROX allocations for text In order to support ROX allocations for module text, it is necessary to handle modifications to the code, such as relocations and alternatives patching, without write access to that memory. One option is to use text patching, but this would make module loading extremely slow and will expose executable code that is not finally formed. A better way is to have memory allocated with ROX permissions contain invalid instructions and keep a writable, but not executable copy of the module text. The relocations and alternative patches would be done on the writable copy using the addresses of the ROX memory. Once the module is completely ready, the updated text will be copied to ROX memory using text patching in one go and the writable copy will be freed. Add support for that to module initialization code and provide necessary interfaces in execmem. Link: https://lkml.kernel.org/r/20241023162711.2579610-5-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewd-by: Luis Chamberlain Tested-by: kdevops Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Dave Hansen Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Oleg Nesterov Cc: Palmer Dabbelt Cc: Peter Zijlstra Cc: Richard Weinberger Cc: Russell King Cc: Song Liu Cc: Stafford Horne Cc: Steven Rostedt (Google) Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Uladzislau Rezki (Sony) Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/execmem.h | 23 +++++++++++++++++++++++ include/linux/module.h | 16 ++++++++++++++++ include/linux/moduleloader.h | 4 ++++ 3 files changed, 43 insertions(+) (limited to 'include/linux') diff --git a/include/linux/execmem.h b/include/linux/execmem.h index 32cef1144117..dfdf19f8a5e8 100644 --- a/include/linux/execmem.h +++ b/include/linux/execmem.h @@ -46,9 +46,11 @@ enum execmem_type { /** * enum execmem_range_flags - options for executable memory allocations * @EXECMEM_KASAN_SHADOW: allocate kasan shadow + * @EXECMEM_ROX_CACHE: allocations should use ROX cache of huge pages */ enum execmem_range_flags { EXECMEM_KASAN_SHADOW = (1 << 0), + EXECMEM_ROX_CACHE = (1 << 1), }; /** @@ -123,6 +125,27 @@ void *execmem_alloc(enum execmem_type type, size_t size); */ void execmem_free(void *ptr); +/** + * execmem_update_copy - copy an update to executable memory + * @dst: destination address to update + * @src: source address containing the data + * @size: how many bytes of memory shold be copied + * + * Copy @size bytes from @src to @dst using text poking if the memory at + * @dst is read-only. + * + * Return: a pointer to @dst or NULL on error + */ +void *execmem_update_copy(void *dst, const void *src, size_t size); + +/** + * execmem_is_rox - check if execmem is read-only + * @type - the execmem type to check + * + * Return: %true if the @type is read-only, %false if it's writable + */ +bool execmem_is_rox(enum execmem_type type); + #if defined(CONFIG_EXECMEM) && !defined(CONFIG_ARCH_WANTS_EXECMEM_LATE) void execmem_init(void); #else diff --git a/include/linux/module.h b/include/linux/module.h index 88ecc5e9f523..2a9386cbdf85 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -367,6 +367,8 @@ enum mod_mem_type { struct module_memory { void *base; + void *rw_copy; + bool is_rox; unsigned int size; #ifdef CONFIG_MODULES_TREE_LOOKUP @@ -767,6 +769,15 @@ static inline bool is_livepatch_module(struct module *mod) void set_module_sig_enforced(void); +void *__module_writable_address(struct module *mod, void *loc); + +static inline void *module_writable_address(struct module *mod, void *loc) +{ + if (!IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX) || !mod) + return loc; + return __module_writable_address(mod, loc); +} + #else /* !CONFIG_MODULES... */ static inline struct module *__module_address(unsigned long addr) @@ -874,6 +885,11 @@ static inline bool module_is_coming(struct module *mod) { return false; } + +static inline void *module_writable_address(struct module *mod, void *loc) +{ + return loc; +} #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index e395461d59e5..1f5507ba5a12 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -108,6 +108,10 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod); +int module_post_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *mod); + #ifdef CONFIG_MODULES void flush_module_init_free_work(void); #else -- cgit v1.2.3 From 0c6378a71574daa6cd1534ad42a956e3262756c7 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 23 Oct 2024 19:27:08 +0300 Subject: arch: introduce set_direct_map_valid_noflush() Add an API that will allow updates of the direct/linear map for a set of physically contiguous pages. It will be used in the following patches. Link: https://lkml.kernel.org/r/20241023162711.2579610-6-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Tested-by: kdevops Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Oleg Nesterov Cc: Palmer Dabbelt Cc: Peter Zijlstra Cc: Richard Weinberger Cc: Russell King Cc: Song Liu Cc: Stafford Horne Cc: Steven Rostedt (Google) Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Uladzislau Rezki (Sony) Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/set_memory.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h index e7aec20fb44f..3030d9245f5a 100644 --- a/include/linux/set_memory.h +++ b/include/linux/set_memory.h @@ -34,6 +34,12 @@ static inline int set_direct_map_default_noflush(struct page *page) return 0; } +static inline int set_direct_map_valid_noflush(struct page *page, + unsigned nr, bool valid) +{ + return 0; +} + static inline bool kernel_page_present(struct page *page) { return true; -- cgit v1.2.3 From 2e45474ab14f0f17c1091c503a13ff2fe2a84486 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 23 Oct 2024 19:27:10 +0300 Subject: execmem: add support for cache of large ROX pages Using large pages to map text areas reduces iTLB pressure and improves performance. Extend execmem_alloc() with an ability to use huge pages with ROX permissions as a cache for smaller allocations. To populate the cache, a writable large page is allocated from vmalloc with VM_ALLOW_HUGE_VMAP, filled with invalid instructions and then remapped as ROX. The direct map alias of that large page is exculded from the direct map. Portions of that large page are handed out to execmem_alloc() callers without any changes to the permissions. When the memory is freed with execmem_free() it is invalidated again so that it won't contain stale instructions. An architecture has to implement execmem_fill_trapping_insns() callback and select ARCH_HAS_EXECMEM_ROX configuration option to be able to use the ROX cache. The cache is enabled on per-range basis when an architecture sets EXECMEM_ROX_CACHE flag in definition of an execmem_range. Link: https://lkml.kernel.org/r/20241023162711.2579610-8-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Luis Chamberlain Tested-by: kdevops Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Dave Hansen Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Oleg Nesterov Cc: Palmer Dabbelt Cc: Peter Zijlstra Cc: Richard Weinberger Cc: Russell King Cc: Song Liu Cc: Stafford Horne Cc: Steven Rostedt (Google) Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Uladzislau Rezki (Sony) Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/execmem.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/execmem.h b/include/linux/execmem.h index dfdf19f8a5e8..1517fa196bf7 100644 --- a/include/linux/execmem.h +++ b/include/linux/execmem.h @@ -53,6 +53,20 @@ enum execmem_range_flags { EXECMEM_ROX_CACHE = (1 << 1), }; +#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX +/** + * execmem_fill_trapping_insns - set memory to contain instructions that + * will trap + * @ptr: pointer to memory to fill + * @size: size of the range to fill + * @writable: is the memory poited by @ptr is writable or ROX + * + * A hook for architecures to fill execmem ranges with invalid instructions. + * Architectures that use EXECMEM_ROX_CACHE must implement this. + */ +void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable); +#endif + /** * struct execmem_range - definition of an address space suitable for code and * related data allocations -- cgit v1.2.3 From 7c8c76e446ca0079692fad44a3993cb1d7666c21 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 23 Oct 2024 10:07:54 -0700 Subject: maple_tree: add mas_for_each_rev() helper Patch series "page allocation tag compression", v4. This patchset implements several improvements: 1. Gracefully handles module unloading while there are used allocations allocated from that module; 2. Provides an option to store page allocation tag references in the page flags, removing dependency on page extensions and eliminating the memory overhead from storing page allocation references (~0.2% of total system memory). This also improves page allocation performance when CONFIG_MEM_ALLOC_PROFILING is enabled by eliminating page extension lookup. Page allocation performance overhead is reduced from 41% to 5.5%. Patch #1 introduces mas_for_each_rev() helper function. Patch #2 introduces shutdown_mem_profiling() helper function to be used when disabling memory allocation profiling. Patch #3 copies module tags into virtually contiguous memory which serves two purposes: - Lets us deal with the situation when module is unloaded while there are still live allocations from that module. Since we are using a copy version of the tags we can safely unload the module. Space and gaps in this contiguous memory are managed using a maple tree. - Enables simple indexing of the tags in the later patches. Patch #4 changes the way we allocate virtually contiguous memory for module tags to reserve only vitrual area and populate physical pages only as needed at module load time. Patch #5 abstracts page allocation tag reference to simplify later changes. Patch #6 adds compression option to the sysctl.vm.mem_profiling boot parameter for storing page allocation tag references inside page flags if they fit. If the number of available page flag bits is insufficient to address all kernel allocations, memory allocation profiling gets disabled with an appropriate warning. This patch (of 6): Add mas_for_each_rev() function to iterate maple tree nodes in reverse order. Link: https://lkml.kernel.org/r/20241023170759.999909-1-surenb@google.com Link: https://lkml.kernel.org/r/20241023170759.999909-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Liam R. Howlett Reviewed-by: Liam R. Howlett Reviewed-by: Pasha Tatashin Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Christoph Hellwig Cc: Daniel Gomez Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: John Hubbard Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kalesh Singh Cc: Kees Cook Cc: Kent Overstreet Cc: Luis Chamberlain Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Minchan Kim Cc: Paul E. McKenney Cc: Petr Pavlu Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Sourav Panda Cc: Steven Rostedt (Google) Cc: Thomas Gleixner Cc: Thomas Huth Cc: Uladzislau Rezki (Sony) Cc: Vlastimil Babka Cc: Xiongwei Song Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 61c236850ca8..cbbcd18d4186 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -592,6 +592,20 @@ static __always_inline void mas_reset(struct ma_state *mas) #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) +/** + * mas_for_each_rev() - Iterate over a range of the maple tree in reverse order. + * @__mas: Maple Tree operation state (maple_state) + * @__entry: Entry retrieved from the tree + * @__min: minimum index to retrieve from the tree + * + * When returned, mas->index and mas->last will hold the entire range for the + * entry. + * + * Note: may return the zero entry. + */ +#define mas_for_each_rev(__mas, __entry, __min) \ + while (((__entry) = mas_find_rev((__mas), (__min))) != NULL) + #ifdef CONFIG_DEBUG_MAPLE_TREE enum mt_dump_format { mt_dump_dec, -- cgit v1.2.3 From 0db6f8d7820a4b788565dac8eed52bfc2c3216da Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 23 Oct 2024 10:07:56 -0700 Subject: alloc_tag: load module tags into separate contiguous memory When a module gets unloaded there is a possibility that some of the allocations it made are still used and therefore the allocation tags corresponding to these allocations are still referenced. As such, the memory for these tags can't be freed. This is currently handled as an abnormal situation and module's data section is not being unloaded. To handle this situation without keeping module's data in memory, allow codetags with longer lifespan than the module to be loaded into their own separate memory. The in-use memory areas and gaps after module unloading in this separate memory are tracked using maple trees. Allocation tags arrange their separate memory so that it is virtually contiguous and that will allow simple allocation tag indexing later on in this patchset. The size of this virtually contiguous memory is set to store up to 100000 allocation tags. [surenb@google.com: fix empty codetag module section handling] Link: https://lkml.kernel.org/r/20241101000017.3856204-1-surenb@google.com [akpm@linux-foundation.org: update comment, per Dan] Link: https://lkml.kernel.org/r/20241023170759.999909-4-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Pasha Tatashin Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Christoph Hellwig Cc: Daniel Gomez Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: John Hubbard Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kalesh Singh Cc: Kees Cook Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Luis Chamberlain Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Minchan Kim Cc: Paul E. McKenney Cc: Petr Pavlu Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Sourav Panda Cc: Steven Rostedt (Google) Cc: Thomas Gleixner Cc: Thomas Huth Cc: Uladzislau Rezki (Sony) Cc: Vlastimil Babka Cc: Xiongwei Song Cc: Yu Zhao Cc: Dan Carpenter Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 13 +++++++++++-- include/linux/codetag.h | 37 ++++++++++++++++++++++++++++++++----- 2 files changed, 43 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 941deffc590d..55d30543c4c7 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -30,6 +30,13 @@ struct alloc_tag { struct alloc_tag_counters __percpu *counters; } __aligned(8); +struct alloc_tag_module_section { + unsigned long start_addr; + unsigned long end_addr; + /* used size */ + unsigned long size; +}; + #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG #define CODETAG_EMPTY ((void *)1) @@ -54,6 +61,8 @@ static inline void set_codetag_empty(union codetag_ref *ref) {} #ifdef CONFIG_MEM_ALLOC_PROFILING +#define ALLOC_TAG_SECTION_NAME "alloc_tags" + struct codetag_bytes { struct codetag *ct; s64 bytes; @@ -76,7 +85,7 @@ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag); #define DEFINE_ALLOC_TAG(_alloc_tag) \ static struct alloc_tag _alloc_tag __used __aligned(8) \ - __section("alloc_tags") = { \ + __section(ALLOC_TAG_SECTION_NAME) = { \ .ct = CODE_TAG_INIT, \ .counters = &_shared_alloc_tag }; @@ -85,7 +94,7 @@ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag); #define DEFINE_ALLOC_TAG(_alloc_tag) \ static DEFINE_PER_CPU(struct alloc_tag_counters, _alloc_tag_cntr); \ static struct alloc_tag _alloc_tag __used __aligned(8) \ - __section("alloc_tags") = { \ + __section(ALLOC_TAG_SECTION_NAME) = { \ .ct = CODE_TAG_INIT, \ .counters = &_alloc_tag_cntr }; diff --git a/include/linux/codetag.h b/include/linux/codetag.h index c2a579ccd455..d10bd9810d32 100644 --- a/include/linux/codetag.h +++ b/include/linux/codetag.h @@ -35,8 +35,15 @@ struct codetag_type_desc { size_t tag_size; void (*module_load)(struct codetag_type *cttype, struct codetag_module *cmod); - bool (*module_unload)(struct codetag_type *cttype, + void (*module_unload)(struct codetag_type *cttype, struct codetag_module *cmod); +#ifdef CONFIG_MODULES + void (*module_replaced)(struct module *mod, struct module *new_mod); + bool (*needs_section_mem)(struct module *mod, unsigned long size); + void *(*alloc_section_mem)(struct module *mod, unsigned long size, + unsigned int prepend, unsigned long align); + void (*free_section_mem)(struct module *mod, bool used); +#endif }; struct codetag_iterator { @@ -71,11 +78,31 @@ struct codetag_type * codetag_register_type(const struct codetag_type_desc *desc); #if defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) + +bool codetag_needs_module_section(struct module *mod, const char *name, + unsigned long size); +void *codetag_alloc_module_section(struct module *mod, const char *name, + unsigned long size, unsigned int prepend, + unsigned long align); +void codetag_free_module_sections(struct module *mod); +void codetag_module_replaced(struct module *mod, struct module *new_mod); void codetag_load_module(struct module *mod); -bool codetag_unload_module(struct module *mod); -#else +void codetag_unload_module(struct module *mod); + +#else /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */ + +static inline bool +codetag_needs_module_section(struct module *mod, const char *name, + unsigned long size) { return false; } +static inline void * +codetag_alloc_module_section(struct module *mod, const char *name, + unsigned long size, unsigned int prepend, + unsigned long align) { return NULL; } +static inline void codetag_free_module_sections(struct module *mod) {} +static inline void codetag_module_replaced(struct module *mod, struct module *new_mod) {} static inline void codetag_load_module(struct module *mod) {} -static inline bool codetag_unload_module(struct module *mod) { return true; } -#endif +static inline void codetag_unload_module(struct module *mod) {} + +#endif /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */ #endif /* _LINUX_CODETAG_H */ -- cgit v1.2.3 From 0f9b685626daa2f8e19a9788625c9b624c223e45 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 23 Oct 2024 10:07:57 -0700 Subject: alloc_tag: populate memory for module tags as needed The memory reserved for module tags does not need to be backed by physical pages until there are tags to store there. Change the way we reserve this memory to allocate only virtual area for the tags and populate it with physical pages as needed when we load a module. [surenb@google.com: avoid execmem_vmap() when !MMU] Link: https://lkml.kernel.org/r/20241031233611.3833002-1-surenb@google.com Link: https://lkml.kernel.org/r/20241023170759.999909-5-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Pasha Tatashin Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Christoph Hellwig Cc: Daniel Gomez Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: John Hubbard Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kalesh Singh Cc: Kees Cook Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Luis Chamberlain Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Minchan Kim Cc: Paul E. McKenney Cc: Petr Pavlu Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Sourav Panda Cc: Steven Rostedt (Google) Cc: Thomas Gleixner Cc: Thomas Huth Cc: Uladzislau Rezki (Sony) Cc: Vlastimil Babka Cc: Xiongwei Song Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/execmem.h | 12 ++++++++++++ include/linux/vmalloc.h | 3 +++ 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/execmem.h b/include/linux/execmem.h index 1517fa196bf7..64130ae19690 100644 --- a/include/linux/execmem.h +++ b/include/linux/execmem.h @@ -139,6 +139,18 @@ void *execmem_alloc(enum execmem_type type, size_t size); */ void execmem_free(void *ptr); +#ifdef CONFIG_MMU +/** + * execmem_vmap - create virtual mapping for EXECMEM_MODULE_DATA memory + * @size: size of the virtual mapping in bytes + * + * Maps virtually contiguous area in the range suitable for EXECMEM_MODULE_DATA. + * + * Return: the area descriptor on success or %NULL on failure. + */ +struct vm_struct *execmem_vmap(size_t size); +#endif + /** * execmem_update_copy - copy an update to executable memory * @dst: destination address to update diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 27408f21e501..31e9ffd936e3 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -202,6 +202,9 @@ extern int remap_vmalloc_range_partial(struct vm_area_struct *vma, extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff); +int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot, + struct page **pages, unsigned int page_shift); + /* * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings() -- cgit v1.2.3 From 42895a86124418d8dd29a93812bc282e569ccfee Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 23 Oct 2024 10:07:58 -0700 Subject: alloc_tag: introduce pgtag_ref_handle to abstract page tag references To simplify later changes to page tag references, introduce new pgtag_ref_handle type. This allows easy replacement of page_ext as a storage of page allocation tags. Link: https://lkml.kernel.org/r/20241023170759.999909-6-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Pasha Tatashin Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Christoph Hellwig Cc: Daniel Gomez Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: John Hubbard Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kalesh Singh Cc: Kees Cook Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Luis Chamberlain Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Minchan Kim Cc: Paul E. McKenney Cc: Petr Pavlu Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Sourav Panda Cc: Steven Rostedt (Google) Cc: Thomas Gleixner Cc: Thomas Huth Cc: Uladzislau Rezki (Sony) Cc: Vlastimil Babka Cc: Xiongwei Song Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mm.h | 25 ++++++------ include/linux/pgalloc_tag.h | 92 ++++++++++++++++++++++++++------------------- 2 files changed, 67 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index eb070c14e309..f9120ac6d901 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4181,37 +4181,38 @@ static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new return; for (i = nr_pages; i < (1 << old_order); i += nr_pages) { - union codetag_ref *ref = get_page_tag_ref(folio_page(folio, i)); + union pgtag_ref_handle handle; + union codetag_ref ref; - if (ref) { + if (get_page_tag_ref(folio_page(folio, i), &ref, &handle)) { /* Set new reference to point to the original tag */ - alloc_tag_ref_set(ref, tag); - put_page_tag_ref(ref); + alloc_tag_ref_set(&ref, tag); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); } } } static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) { + union pgtag_ref_handle handle; + union codetag_ref ref; struct alloc_tag *tag; - union codetag_ref *ref; tag = pgalloc_tag_get(&old->page); if (!tag) return; - ref = get_page_tag_ref(&new->page); - if (!ref) + if (!get_page_tag_ref(&new->page, &ref, &handle)) return; /* Clear the old ref to the original allocation tag. */ clear_page_tag_ref(&old->page); /* Decrement the counters of the tag on get_new_folio. */ - alloc_tag_sub(ref, folio_size(new)); - - __alloc_tag_ref_set(ref, tag); - - put_page_tag_ref(ref); + alloc_tag_sub(&ref, folio_size(new)); + __alloc_tag_ref_set(&ref, tag); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); } #else /* !CONFIG_MEM_ALLOC_PROFILING */ static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 59a3deb792a8..b13cd3313a88 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -11,46 +11,59 @@ #include +union pgtag_ref_handle { + union codetag_ref *ref; /* reference in page extension */ +}; + extern struct page_ext_operations page_alloc_tagging_ops; -static inline union codetag_ref *codetag_ref_from_page_ext(struct page_ext *page_ext) +/* Should be called only if mem_alloc_profiling_enabled() */ +static inline bool get_page_tag_ref(struct page *page, union codetag_ref *ref, + union pgtag_ref_handle *handle) { - return (union codetag_ref *)page_ext_data(page_ext, &page_alloc_tagging_ops); -} + struct page_ext *page_ext; + union codetag_ref *tmp; -static inline struct page_ext *page_ext_from_codetag_ref(union codetag_ref *ref) -{ - return (void *)ref - page_alloc_tagging_ops.offset; + if (!page) + return false; + + page_ext = page_ext_get(page); + if (!page_ext) + return false; + + tmp = (union codetag_ref *)page_ext_data(page_ext, &page_alloc_tagging_ops); + ref->ct = tmp->ct; + handle->ref = tmp; + return true; } -/* Should be called only if mem_alloc_profiling_enabled() */ -static inline union codetag_ref *get_page_tag_ref(struct page *page) +static inline void put_page_tag_ref(union pgtag_ref_handle handle) { - if (page) { - struct page_ext *page_ext = page_ext_get(page); + if (WARN_ON(!handle.ref)) + return; - if (page_ext) - return codetag_ref_from_page_ext(page_ext); - } - return NULL; + page_ext_put((void *)handle.ref - page_alloc_tagging_ops.offset); } -static inline void put_page_tag_ref(union codetag_ref *ref) +static inline void update_page_tag_ref(union pgtag_ref_handle handle, + union codetag_ref *ref) { - if (WARN_ON(!ref)) + if (WARN_ON(!handle.ref || !ref)) return; - page_ext_put(page_ext_from_codetag_ref(ref)); + handle.ref->ct = ref->ct; } static inline void clear_page_tag_ref(struct page *page) { if (mem_alloc_profiling_enabled()) { - union codetag_ref *ref = get_page_tag_ref(page); + union pgtag_ref_handle handle; + union codetag_ref ref; - if (ref) { - set_codetag_empty(ref); - put_page_tag_ref(ref); + if (get_page_tag_ref(page, &ref, &handle)) { + set_codetag_empty(&ref); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); } } } @@ -59,11 +72,13 @@ static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, unsigned int nr) { if (mem_alloc_profiling_enabled()) { - union codetag_ref *ref = get_page_tag_ref(page); + union pgtag_ref_handle handle; + union codetag_ref ref; - if (ref) { - alloc_tag_add(ref, task->alloc_tag, PAGE_SIZE * nr); - put_page_tag_ref(ref); + if (get_page_tag_ref(page, &ref, &handle)) { + alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); } } } @@ -71,11 +86,13 @@ static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) { if (mem_alloc_profiling_enabled()) { - union codetag_ref *ref = get_page_tag_ref(page); + union pgtag_ref_handle handle; + union codetag_ref ref; - if (ref) { - alloc_tag_sub(ref, PAGE_SIZE * nr); - put_page_tag_ref(ref); + if (get_page_tag_ref(page, &ref, &handle)) { + alloc_tag_sub(&ref, PAGE_SIZE * nr); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); } } } @@ -85,13 +102,14 @@ static inline struct alloc_tag *pgalloc_tag_get(struct page *page) struct alloc_tag *tag = NULL; if (mem_alloc_profiling_enabled()) { - union codetag_ref *ref = get_page_tag_ref(page); - - alloc_tag_sub_check(ref); - if (ref) { - if (ref->ct) - tag = ct_to_alloc_tag(ref->ct); - put_page_tag_ref(ref); + union pgtag_ref_handle handle; + union codetag_ref ref; + + if (get_page_tag_ref(page, &ref, &handle)) { + alloc_tag_sub_check(&ref); + if (ref.ct) + tag = ct_to_alloc_tag(ref.ct); + put_page_tag_ref(handle); } } @@ -106,8 +124,6 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) #else /* CONFIG_MEM_ALLOC_PROFILING */ -static inline union codetag_ref *get_page_tag_ref(struct page *page) { return NULL; } -static inline void put_page_tag_ref(union codetag_ref *ref) {} static inline void clear_page_tag_ref(struct page *page) {} static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, unsigned int nr) {} -- cgit v1.2.3 From 4835f747d3ed181bf2c67930fe06b2c01a5d2323 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 23 Oct 2024 10:07:59 -0700 Subject: alloc_tag: support for page allocation tag compression Implement support for storing page allocation tag references directly in the page flags instead of page extensions. sysctl.vm.mem_profiling boot parameter it extended to provide a way for a user to request this mode. Enabling compression eliminates memory overhead caused by page_ext and results in better performance for page allocations. However this mode will not work if the number of available page flag bits is insufficient to address all kernel allocations. Such condition can happen during boot or when loading a module. If this condition is detected, memory allocation profiling gets disabled with an appropriate warning. By default compression mode is disabled. Link: https://lkml.kernel.org/r/20241023170759.999909-7-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Pasha Tatashin Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Christoph Hellwig Cc: Daniel Gomez Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: John Hubbard Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kalesh Singh Cc: Kees Cook Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Luis Chamberlain Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Minchan Kim Cc: Paul E. McKenney Cc: Petr Pavlu Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Sourav Panda Cc: Steven Rostedt (Google) Cc: Thomas Gleixner Cc: Thomas Huth Cc: Uladzislau Rezki (Sony) Cc: Vlastimil Babka Cc: Xiongwei Song Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 10 ++- include/linux/codetag.h | 3 + include/linux/page-flags-layout.h | 7 ++ include/linux/pgalloc_tag.h | 145 +++++++++++++++++++++++++++++++++----- 4 files changed, 147 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 55d30543c4c7..7c0786bdf9af 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -30,8 +30,16 @@ struct alloc_tag { struct alloc_tag_counters __percpu *counters; } __aligned(8); +struct alloc_tag_kernel_section { + struct alloc_tag *first_tag; + unsigned long count; +}; + struct alloc_tag_module_section { - unsigned long start_addr; + union { + unsigned long start_addr; + struct alloc_tag *first_tag; + }; unsigned long end_addr; /* used size */ unsigned long size; diff --git a/include/linux/codetag.h b/include/linux/codetag.h index d10bd9810d32..d14dbd26b370 100644 --- a/include/linux/codetag.h +++ b/include/linux/codetag.h @@ -13,6 +13,9 @@ struct codetag_module; struct seq_buf; struct module; +#define CODETAG_SECTION_START_PREFIX "__start_" +#define CODETAG_SECTION_STOP_PREFIX "__stop_" + /* * An instance of this structure is created in a special ELF section at every * code location being tagged. At runtime, the special section is treated as diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index 7d79818dc065..4f5c9e979bb9 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -111,5 +111,12 @@ ZONES_WIDTH - LRU_GEN_WIDTH - SECTIONS_WIDTH - \ NODES_WIDTH - KASAN_TAG_WIDTH - LAST_CPUPID_WIDTH) +#define NR_NON_PAGEFLAG_BITS (SECTIONS_WIDTH + NODES_WIDTH + ZONES_WIDTH + \ + LAST_CPUPID_SHIFT + KASAN_TAG_WIDTH + \ + LRU_GEN_WIDTH + LRU_REFS_WIDTH) + +#define NR_UNUSED_PAGEFLAG_BITS (BITS_PER_LONG - \ + (NR_NON_PAGEFLAG_BITS + NR_PAGEFLAGS)) + #endif #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index b13cd3313a88..1fe63b52e5e5 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -11,29 +11,118 @@ #include +extern struct page_ext_operations page_alloc_tagging_ops; +extern unsigned long alloc_tag_ref_mask; +extern int alloc_tag_ref_offs; +extern struct alloc_tag_kernel_section kernel_tags; + +DECLARE_STATIC_KEY_FALSE(mem_profiling_compressed); + +typedef u16 pgalloc_tag_idx; + union pgtag_ref_handle { union codetag_ref *ref; /* reference in page extension */ + struct page *page; /* reference in page flags */ }; -extern struct page_ext_operations page_alloc_tagging_ops; +/* Reserved indexes */ +#define CODETAG_ID_NULL 0 +#define CODETAG_ID_EMPTY 1 +#define CODETAG_ID_FIRST 2 + +#ifdef CONFIG_MODULES + +extern struct alloc_tag_module_section module_tags; + +static inline struct alloc_tag *module_idx_to_tag(pgalloc_tag_idx idx) +{ + return &module_tags.first_tag[idx - kernel_tags.count]; +} + +static inline pgalloc_tag_idx module_tag_to_idx(struct alloc_tag *tag) +{ + return CODETAG_ID_FIRST + kernel_tags.count + (tag - module_tags.first_tag); +} + +#else /* CONFIG_MODULES */ + +static inline struct alloc_tag *module_idx_to_tag(pgalloc_tag_idx idx) +{ + pr_warn("invalid page tag reference %lu\n", (unsigned long)idx); + return NULL; +} + +static inline pgalloc_tag_idx module_tag_to_idx(struct alloc_tag *tag) +{ + pr_warn("invalid page tag 0x%lx\n", (unsigned long)tag); + return CODETAG_ID_NULL; +} + +#endif /* CONFIG_MODULES */ + +static inline void idx_to_ref(pgalloc_tag_idx idx, union codetag_ref *ref) +{ + switch (idx) { + case (CODETAG_ID_NULL): + ref->ct = NULL; + break; + case (CODETAG_ID_EMPTY): + set_codetag_empty(ref); + break; + default: + idx -= CODETAG_ID_FIRST; + ref->ct = idx < kernel_tags.count ? + &kernel_tags.first_tag[idx].ct : + &module_idx_to_tag(idx)->ct; + break; + } +} + +static inline pgalloc_tag_idx ref_to_idx(union codetag_ref *ref) +{ + struct alloc_tag *tag; + + if (!ref->ct) + return CODETAG_ID_NULL; + + if (is_codetag_empty(ref)) + return CODETAG_ID_EMPTY; + + tag = ct_to_alloc_tag(ref->ct); + if (tag >= kernel_tags.first_tag && tag < kernel_tags.first_tag + kernel_tags.count) + return CODETAG_ID_FIRST + (tag - kernel_tags.first_tag); + + return module_tag_to_idx(tag); +} + + /* Should be called only if mem_alloc_profiling_enabled() */ static inline bool get_page_tag_ref(struct page *page, union codetag_ref *ref, union pgtag_ref_handle *handle) { - struct page_ext *page_ext; - union codetag_ref *tmp; - if (!page) return false; - page_ext = page_ext_get(page); - if (!page_ext) - return false; + if (static_key_enabled(&mem_profiling_compressed)) { + pgalloc_tag_idx idx; + + idx = (page->flags >> alloc_tag_ref_offs) & alloc_tag_ref_mask; + idx_to_ref(idx, ref); + handle->page = page; + } else { + struct page_ext *page_ext; + union codetag_ref *tmp; + + page_ext = page_ext_get(page); + if (!page_ext) + return false; + + tmp = (union codetag_ref *)page_ext_data(page_ext, &page_alloc_tagging_ops); + ref->ct = tmp->ct; + handle->ref = tmp; + } - tmp = (union codetag_ref *)page_ext_data(page_ext, &page_alloc_tagging_ops); - ref->ct = tmp->ct; - handle->ref = tmp; return true; } @@ -42,16 +131,35 @@ static inline void put_page_tag_ref(union pgtag_ref_handle handle) if (WARN_ON(!handle.ref)) return; - page_ext_put((void *)handle.ref - page_alloc_tagging_ops.offset); + if (!static_key_enabled(&mem_profiling_compressed)) + page_ext_put((void *)handle.ref - page_alloc_tagging_ops.offset); } -static inline void update_page_tag_ref(union pgtag_ref_handle handle, - union codetag_ref *ref) +static inline void update_page_tag_ref(union pgtag_ref_handle handle, union codetag_ref *ref) { - if (WARN_ON(!handle.ref || !ref)) - return; - - handle.ref->ct = ref->ct; + if (static_key_enabled(&mem_profiling_compressed)) { + struct page *page = handle.page; + unsigned long old_flags; + unsigned long flags; + unsigned long idx; + + if (WARN_ON(!page || !ref)) + return; + + idx = (unsigned long)ref_to_idx(ref); + idx = (idx & alloc_tag_ref_mask) << alloc_tag_ref_offs; + do { + old_flags = READ_ONCE(page->flags); + flags = old_flags; + flags &= ~(alloc_tag_ref_mask << alloc_tag_ref_offs); + flags |= idx; + } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); + } else { + if (WARN_ON(!handle.ref || !ref)) + return; + + handle.ref->ct = ref->ct; + } } static inline void clear_page_tag_ref(struct page *page) @@ -122,6 +230,8 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr); } +void __init alloc_tag_sec_init(void); + #else /* CONFIG_MEM_ALLOC_PROFILING */ static inline void clear_page_tag_ref(struct page *page) {} @@ -130,6 +240,7 @@ static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; } static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {} +static inline void alloc_tag_sec_init(void) {} #endif /* CONFIG_MEM_ALLOC_PROFILING */ -- cgit v1.2.3 From b7fc16a16b0850e41b88eae0edfa4c085c012347 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 24 Oct 2024 09:23:18 -0700 Subject: mm/codetag: uninline and move pgalloc_tag_copy and pgalloc_tag_split pgalloc_tag_copy() and pgalloc_tag_split() are sizable and outside of any performance-critical paths, so it should be fine to uninline them. Also move their declarations into pgalloc_tag.h which seems like a more appropriate place for them. No functional changes other than uninlining. Link: https://lkml.kernel.org/r/20241024162318.1640781-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Andrew Morton Acked-by: Yu Zhao Cc: Kent Overstreet Cc: Pasha Tatashin Cc: Sourav Panda Signed-off-by: Andrew Morton --- include/linux/mm.h | 58 --------------------------------------------- include/linux/pgalloc_tag.h | 5 ++++ 2 files changed, 5 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f9120ac6d901..08e487e5850a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4166,62 +4166,4 @@ static inline int do_mseal(unsigned long start, size_t len_in, unsigned long fla } #endif -#ifdef CONFIG_MEM_ALLOC_PROFILING -static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) -{ - int i; - struct alloc_tag *tag; - unsigned int nr_pages = 1 << new_order; - - if (!mem_alloc_profiling_enabled()) - return; - - tag = pgalloc_tag_get(&folio->page); - if (!tag) - return; - - for (i = nr_pages; i < (1 << old_order); i += nr_pages) { - union pgtag_ref_handle handle; - union codetag_ref ref; - - if (get_page_tag_ref(folio_page(folio, i), &ref, &handle)) { - /* Set new reference to point to the original tag */ - alloc_tag_ref_set(&ref, tag); - update_page_tag_ref(handle, &ref); - put_page_tag_ref(handle); - } - } -} - -static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) -{ - union pgtag_ref_handle handle; - union codetag_ref ref; - struct alloc_tag *tag; - - tag = pgalloc_tag_get(&old->page); - if (!tag) - return; - - if (!get_page_tag_ref(&new->page, &ref, &handle)) - return; - - /* Clear the old ref to the original allocation tag. */ - clear_page_tag_ref(&old->page); - /* Decrement the counters of the tag on get_new_folio. */ - alloc_tag_sub(&ref, folio_size(new)); - __alloc_tag_ref_set(&ref, tag); - update_page_tag_ref(handle, &ref); - put_page_tag_ref(handle); -} -#else /* !CONFIG_MEM_ALLOC_PROFILING */ -static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) -{ -} - -static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) -{ -} -#endif /* CONFIG_MEM_ALLOC_PROFILING */ - #endif /* _LINUX_MM_H */ diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 1fe63b52e5e5..0e43ab653ab6 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -230,6 +230,9 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr); } +void pgalloc_tag_split(struct folio *folio, int old_order, int new_order); +void pgalloc_tag_copy(struct folio *new, struct folio *old); + void __init alloc_tag_sec_init(void); #else /* CONFIG_MEM_ALLOC_PROFILING */ @@ -241,6 +244,8 @@ static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; } static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {} static inline void alloc_tag_sec_init(void) {} +static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) {} +static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) {} #endif /* CONFIG_MEM_ALLOC_PROFILING */ -- cgit v1.2.3 From d7cb6d7414ea1b33536fa6d11805cb8dceec1f97 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 7 Nov 2024 15:42:59 +0900 Subject: block: RCU protect disk->conv_zones_bitmap Ensure that a disk revalidation changing the conventional zones bitmap of a disk does not cause invalid memory references when using the disk_zone_is_conv() helper by RCU protecting the disk->conv_zones_bitmap pointer. disk_zone_is_conv() is modified to operate under the RCU read lock and the function disk_set_conv_zones_bitmap() is added to update a disk conv_zones_bitmap pointer using rcu_replace_pointer() with the disk zone_wplugs_lock spinlock held. disk_free_zone_resources() is modified to call disk_update_zone_resources() with a NULL bitmap pointer to free the disk conv_zones_bitmap. disk_set_conv_zones_bitmap() is also used in disk_update_zone_resources() to set the new (revalidated) bitmap and free the old one. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241107064300.227731-2-dlemoal@kernel.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7bfc877e159e..6d379803c777 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -195,7 +195,7 @@ struct gendisk { unsigned int nr_zones; unsigned int zone_capacity; unsigned int last_zone_capacity; - unsigned long *conv_zones_bitmap; + unsigned long __rcu *conv_zones_bitmap; unsigned int zone_wplugs_hash_bits; spinlock_t zone_wplugs_lock; struct mempool_s *zone_wplugs_pool; -- cgit v1.2.3 From f3d9bf05140dd242cdc33c431489a853f2bc1b67 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 7 Nov 2024 15:43:00 +0900 Subject: block: Add a public bdev_zone_is_seq() helper Turn the private disk_zone_is_conv() function in blk-zoned.c into a public and documented bdev_zone_is_seq() helper with the inverse polarity of the original function, also adding a check for non-zoned devices so that all file systems can use the helper, even with a regular block device. Suggested-by: Christoph Hellwig Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241107064300.227731-3-dlemoal@kernel.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6d379803c777..93551772c1d6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1395,6 +1395,33 @@ static inline bool bdev_is_zone_start(struct block_device *bdev, return bdev_offset_from_zone_start(bdev, sector) == 0; } +/** + * bdev_zone_is_seq - check if a sector belongs to a sequential write zone + * @bdev: block device to check + * @sector: sector number + * + * Check if @sector on @bdev is contained in a sequential write required zone. + */ +static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector) +{ + bool is_seq = false; + +#if IS_ENABLED(CONFIG_BLK_DEV_ZONED) + if (bdev_is_zoned(bdev)) { + struct gendisk *disk = bdev->bd_disk; + unsigned long *bitmap; + + rcu_read_lock(); + bitmap = rcu_dereference(disk->conv_zones_bitmap); + is_seq = !bitmap || + !test_bit(disk_zone_no(disk, sector), bitmap); + rcu_read_unlock(); + } +#endif + + return is_seq; +} + static inline int queue_dma_alignment(const struct request_queue *q) { return q->limits.dma_alignment; -- cgit v1.2.3 From f7470591f8db1a72ce9f7ab49cb13c2b21b92002 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 5 Oct 2024 21:01:12 +0100 Subject: mm: convert page_to_pgoff() to page_pgoff() Patch series "page->index removals in mm", v2. As part of shrinking struct page, we need to stop using page->index. This patchset gets rid of most of the remaining references to page->index in mm, as well as increasing the number of functions which take a const folio/page pointer. It shrinks the text segment of mm by a few hundred bytes in my test config, probably mostly from removing calls to compound_head() in page_to_pgoff(). This patch (of 7): Change the function signature to pass in the folio as all three callers have it. This removes a reference to page->index, which we're trying to get rid of. And add kernel-doc. Link: https://lkml.kernel.org/r/20241005200121.3231142-1-willy@infradead.org Link: https://lkml.kernel.org/r/20241005200121.3231142-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- include/linux/pagemap.h | 31 +++++++++++++++++-------------- 2 files changed, 18 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 08e487e5850a..78848fbefe94 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1895,7 +1895,7 @@ static inline unsigned long page_to_section(const struct page *page) * * Return: The Page Frame Number of the first page in the folio. */ -static inline unsigned long folio_pfn(struct folio *folio) +static inline unsigned long folio_pfn(const struct folio *folio) { return page_to_pfn(&folio->page); } diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 68a5f1ff3301..bcf0865a38ae 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1011,22 +1011,25 @@ static inline struct folio *read_mapping_folio(struct address_space *mapping, return read_cache_folio(mapping, index, NULL, file); } -/* - * Get the offset in PAGE_SIZE (even for hugetlb pages). +/** + * page_pgoff - Calculate the logical page offset of this page. + * @folio: The folio containing this page. + * @page: The page which we need the offset of. + * + * For file pages, this is the offset from the beginning of the file + * in units of PAGE_SIZE. For anonymous pages, this is the offset from + * the beginning of the anon_vma in units of PAGE_SIZE. This will + * return nonsense for KSM pages. + * + * Context: Caller must have a reference on the folio or otherwise + * prevent it from being split or freed. + * + * Return: The offset in units of PAGE_SIZE. */ -static inline pgoff_t page_to_pgoff(struct page *page) +static inline pgoff_t page_pgoff(const struct folio *folio, + const struct page *page) { - struct page *head; - - if (likely(!PageTransTail(page))) - return page->index; - - head = compound_head(page); - /* - * We don't initialize ->index for tail pages: calculate based on - * head page - */ - return head->index + page - head; + return folio->index + folio_page_idx(folio, page); } /* -- cgit v1.2.3 From 713da0b33b3e9d16272b57f4c44dee5c052be9b7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 5 Oct 2024 21:01:14 +0100 Subject: mm: renovate page_address_in_vma() This function doesn't modify any of its arguments, so if we make a few other functions take const pointers, we can make page_address_in_vma() take const pointers too. All of its callers have the containing folio already, so pass that in as an argument instead of recalculating it. Also add kernel-doc Link: https://lkml.kernel.org/r/20241005200121.3231142-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/rmap.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index d5e93e44322e..78923015a2e8 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -728,11 +728,8 @@ page_vma_mapped_walk_restart(struct page_vma_mapped_walk *pvmw) } bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw); - -/* - * Used by swapoff to help locate where page is expected in vma. - */ -unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); +unsigned long page_address_in_vma(const struct folio *folio, + const struct page *, const struct vm_area_struct *); /* * Cleans the PTEs of shared mappings. -- cgit v1.2.3 From 68158bfa3dbd4af8461ef75a91ffc03be942c8fe Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 5 Oct 2024 21:01:15 +0100 Subject: mm: mass constification of folio/page pointers Now that page_pgoff() takes const pointers, we can constify the pointers to a lot of functions. Link: https://lkml.kernel.org/r/20241005200121.3231142-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/ksm.h | 7 ++++--- include/linux/rmap.h | 10 +++++----- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 29022e71a074..6a53ac4885bb 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -90,7 +90,7 @@ struct folio *ksm_might_need_to_copy(struct folio *folio, void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc); void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); -void collect_procs_ksm(struct folio *folio, struct page *page, +void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early); long ksm_process_profit(struct mm_struct *); @@ -122,8 +122,9 @@ static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { } -static inline void collect_procs_ksm(struct folio *folio, struct page *page, - struct list_head *to_kill, int force_early) +static inline void collect_procs_ksm(const struct folio *folio, + const struct page *page, struct list_head *to_kill, + int force_early) { } diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 78923015a2e8..683a04088f3f 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -171,7 +171,7 @@ static inline void anon_vma_merge(struct vm_area_struct *vma, unlink_anon_vmas(next); } -struct anon_vma *folio_get_anon_vma(struct folio *folio); +struct anon_vma *folio_get_anon_vma(const struct folio *folio); /* RMAP flags, currently only relevant for some anon rmap operations. */ typedef int __bitwise rmap_t; @@ -194,8 +194,8 @@ enum rmap_level { RMAP_LEVEL_PMD, }; -static inline void __folio_rmap_sanity_checks(struct folio *folio, - struct page *page, int nr_pages, enum rmap_level level) +static inline void __folio_rmap_sanity_checks(const struct folio *folio, + const struct page *page, int nr_pages, enum rmap_level level) { /* hugetlb folios are handled separately. */ VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); @@ -771,14 +771,14 @@ struct rmap_walk_control { bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, void *arg); int (*done)(struct folio *folio); - struct anon_vma *(*anon_lock)(struct folio *folio, + struct anon_vma *(*anon_lock)(const struct folio *folio, struct rmap_walk_control *rwc); bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc); void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc); -struct anon_vma *folio_lock_anon_vma_read(struct folio *folio, +struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio, struct rmap_walk_control *rwc); #else /* !CONFIG_MMU */ -- cgit v1.2.3 From 0386aaa6e9c826bc494169a914e01a86befe6edf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 5 Oct 2024 21:01:16 +0100 Subject: bootmem: stop using page->index Encode the type into the bottom four bits of page->private and the info into the remaining bits. Also turn the bootmem type into a named enum. [arnd@arndb.de: bootmem: add bootmem_type stub function] Link: https://lkml.kernel.org/r/20241015143802.577613-1-arnd@kernel.org [akpm@linux-foundation.org: fix build with !CONFIG_HAVE_BOOTMEM_INFO_NODE] Link: https://lore.kernel.org/oe-kbuild-all/202410090311.eaqcL7IZ-lkp@intel.com/ Link: https://lkml.kernel.org/r/20241005200121.3231142-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Arnd Bergmann Cc: kernel test robot Signed-off-by: Andrew Morton --- include/linux/bootmem_info.h | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h index cffa38a73618..d8a8d245824a 100644 --- a/include/linux/bootmem_info.h +++ b/include/linux/bootmem_info.h @@ -6,11 +6,10 @@ #include /* - * Types for free bootmem stored in page->lru.next. These have to be in - * some random range in unsigned long space for debugging purposes. + * Types for free bootmem stored in the low bits of page->private. */ -enum { - MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 12, +enum bootmem_type { + MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 1, SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE, MIX_SECTION_INFO, NODE_INFO, @@ -21,9 +20,19 @@ enum { void __init register_page_bootmem_info_node(struct pglist_data *pgdat); void get_page_bootmem(unsigned long info, struct page *page, - unsigned long type); + enum bootmem_type type); void put_page_bootmem(struct page *page); +static inline enum bootmem_type bootmem_type(const struct page *page) +{ + return (unsigned long)page->private & 0xf; +} + +static inline unsigned long bootmem_info(const struct page *page) +{ + return (unsigned long)page->private >> 4; +} + /* * Any memory allocated via the memblock allocator and not via the * buddy will be marked reserved already in the memmap. For those @@ -31,7 +40,7 @@ void put_page_bootmem(struct page *page); */ static inline void free_bootmem_page(struct page *page) { - unsigned long magic = page->index; + enum bootmem_type type = bootmem_type(page); /* * The reserve_bootmem_region sets the reserved flag on bootmem @@ -39,7 +48,7 @@ static inline void free_bootmem_page(struct page *page) */ VM_BUG_ON_PAGE(page_ref_count(page) != 2, page); - if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) + if (type == SECTION_INFO || type == MIX_SECTION_INFO) put_page_bootmem(page); else VM_BUG_ON_PAGE(1, page); @@ -53,8 +62,18 @@ static inline void put_page_bootmem(struct page *page) { } +static inline enum bootmem_type bootmem_type(const struct page *page) +{ + return SECTION_INFO; +} + +static inline unsigned long bootmem_info(const struct page *page) +{ + return 0; +} + static inline void get_page_bootmem(unsigned long info, struct page *page, - unsigned long type) + enum bootmem_type type) { } -- cgit v1.2.3 From 6a78699838a0ddeed3620ddf50c1521f1fe1e811 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 31 Oct 2024 21:37:19 +0800 Subject: block: always verify unfreeze lock on the owner task commit f1be1788a32e ("block: model freeze & enter queue as lock for supporting lockdep") tries to apply lockdep for verifying freeze & unfreeze. However, the verification is only done the outmost freeze and unfreeze. This way is actually not correct because q->mq_freeze_depth still may drop to zero on other task instead of the freeze owner task. Fix this issue by always verifying the last unfreeze lock on the owner task context, and make sure both the outmost freeze & unfreeze are verified in the current task. Fixes: f1be1788a32e ("block: model freeze & enter queue as lock for supporting lockdep") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241031133723.303835-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 93551772c1d6..1b51a7c92e9b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -575,6 +575,10 @@ struct request_queue { struct throtl_data *td; #endif struct rcu_head rcu_head; +#ifdef CONFIG_LOCKDEP + struct task_struct *mq_freeze_owner; + int mq_freeze_owner_depth; +#endif wait_queue_head_t mq_freeze_wq; /* * Protect concurrent access to q_usage_counter by -- cgit v1.2.3 From 9c477088b60dd23f3120b2b01b14d85047a033da Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 6 Nov 2024 21:19:59 +0100 Subject: net: phy: make genphy_c45_write_eee_adv() static genphy_c45_write_eee_adv() isn't used outside phy-c45.c, so make it static. Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/d23bd784-44e6-4a15-af3a-b37379156521@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index bf0eb4e5d35c..bb09a5a128e1 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1953,7 +1953,6 @@ int genphy_c45_ethtool_get_eee(struct phy_device *phydev, struct ethtool_keee *data); int genphy_c45_ethtool_set_eee(struct phy_device *phydev, struct ethtool_keee *data); -int genphy_c45_write_eee_adv(struct phy_device *phydev, unsigned long *adv); int genphy_c45_an_config_eee_aneg(struct phy_device *phydev); int genphy_c45_read_eee_adv(struct phy_device *phydev, unsigned long *adv); -- cgit v1.2.3 From db73835f54fc57bc267d43836905588f24ddac85 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 6 Nov 2024 21:25:12 +0100 Subject: net: phy: remove genphy_config_eee_advert bcm_config_lre_aneg() doesn't use genphy_config_eee_advert() any longer. As this was the only user, we can remove genphy_config_eee_advert() now. Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/37da7f3e-b883-4c07-9881-b8c0516822b7@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index bb09a5a128e1..1e4127c495c0 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1885,7 +1885,6 @@ int genphy_read_abilities(struct phy_device *phydev); int genphy_setup_forced(struct phy_device *phydev); int genphy_restart_aneg(struct phy_device *phydev); int genphy_check_and_restart_aneg(struct phy_device *phydev, bool restart); -int genphy_config_eee_advert(struct phy_device *phydev); int __genphy_config_aneg(struct phy_device *phydev, bool changed); int genphy_aneg_done(struct phy_device *phydev); int genphy_update_link(struct phy_device *phydev); -- cgit v1.2.3 From b45a3777ceabbe08ab7a6e97f258191c07cbab8d Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 8 Nov 2024 10:13:52 +0800 Subject: iommu: Pass old domain to set_dev_pasid op To support domain replacement for pasid, the underlying iommu driver needs to know the old domain hence be able to clean up the existing attachment. It would be much convenient for iommu layer to pass down the old domain. Otherwise, iommu drivers would need to track domain for pasids by themselves, this would duplicate code among the iommu drivers. Or iommu drivers would rely group->pasid_array to get domain, which may not always the correct one. Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Reviewed-by: Nicolin Chen Reviewed-by: Vasant Hegde Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-2-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index bd722f473635..32dce80aa7fd 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -642,7 +642,7 @@ struct iommu_ops { struct iommu_domain_ops { int (*attach_dev)(struct iommu_domain *domain, struct device *dev); int (*set_dev_pasid)(struct iommu_domain *domain, struct device *dev, - ioasid_t pasid); + ioasid_t pasid, struct iommu_domain *old); int (*map_pages)(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, -- cgit v1.2.3 From 980e3016ebcc00021bf8190c64fd65ba23e90498 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 8 Nov 2024 10:14:04 +0800 Subject: iommu: Make set_dev_pasid op support domain replacement The iommu core is going to support domain replacement for pasid, it needs to make the set_dev_pasid op support replacing domain and keep the old domain config in the failure case. AMD iommu driver does not support domain replacement for pasid yet, so it would fail the set_dev_pasid op to keep the old config if the input @old is non-NULL. Till now, all the set_dev_pasid callbacks can handle the old parameter and can keep the old config when failed, so update the kdoc of set_dev_pasid op. Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Vasant Hegde Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241107122234.7424-14-yi.l.liu@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 32dce80aa7fd..27f923450a7c 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -616,7 +616,8 @@ struct iommu_ops { * * EBUSY - device is attached to a domain and cannot be changed * * ENODEV - device specific errors, not able to be attached * * - treated as ENODEV by the caller. Use is discouraged - * @set_dev_pasid: set an iommu domain to a pasid of device + * @set_dev_pasid: set or replace an iommu domain to a pasid of device. The pasid of + * the device should be left in the old config in error case. * @map_pages: map a physically contiguous set of pages of the same size to * an iommu domain. * @unmap_pages: unmap a number of pages of the same size from an iommu domain -- cgit v1.2.3 From d1f4390dd28ba110f232615dc4610ac1bb2f39f2 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 8 Nov 2024 16:07:37 +0200 Subject: regmap: provide regmap_assign_bits() Add another bits helper to regmap API: this one sets given bits if value is true and clears them if it's false. Suggested-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski Signed-off-by: Tomi Valkeinen Link: https://patch.msgid.link/20241108-assign-bits-v1-1-382790562d99@ideasonboard.com Signed-off-by: Mark Brown --- include/linux/regmap.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index da07584284ab..0d5b74f9bd32 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -1335,6 +1335,15 @@ static inline int regmap_clear_bits(struct regmap *map, return regmap_update_bits_base(map, reg, bits, 0, NULL, false, false); } +static inline int regmap_assign_bits(struct regmap *map, unsigned int reg, + unsigned int bits, bool value) +{ + if (value) + return regmap_set_bits(map, reg, bits); + else + return regmap_clear_bits(map, reg, bits); +} + int regmap_test_bits(struct regmap *map, unsigned int reg, unsigned int bits); /** @@ -1803,6 +1812,13 @@ static inline int regmap_clear_bits(struct regmap *map, return -EINVAL; } +static inline int regmap_assign_bits(struct regmap *map, unsigned int reg, + unsigned int bits, bool value) +{ + WARN_ONCE(1, "regmap API is disabled"); + return -EINVAL; +} + static inline int regmap_test_bits(struct regmap *map, unsigned int reg, unsigned int bits) { -- cgit v1.2.3 From 06cf321aadef17c7b1578369e314193c0e1c7d8e Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 7 Nov 2024 14:58:19 -0600 Subject: range: Add range_overlaps() Code to support CXL Dynamic Capacity devices will have extent ranges which need to be compared for intersection not a subset as is being checked in range_contains(). range_overlaps() is defined in btrfs with a different meaning from what is required in the standard range code. Dan Williams pointed this out in [1]. Adjust the btrfs call according to his suggestion there. Then add a generic range_overlaps(). Cc: Dan Williams Cc: Chris Mason Cc: Josef Bacik Cc: David Sterba Cc: linux-btrfs@vger.kernel.org Link: https://lore.kernel.org/all/65949f79ef908_8dc68294f2@dwillia2-xfh.jf.intel.com.notmuch/ [1] Acked-by: David Sterba Reviewed-by: Davidlohr Bueso Reviewed-by: Johannes Thumshirn Reviewed-by: Fan Ni Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Signed-off-by: Ira Weiny Link: https://patch.msgid.link/20241107-dcd-type2-upstream-v7-1-56a84e66bc36@intel.com Signed-off-by: Dave Jiang --- include/linux/range.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/range.h b/include/linux/range.h index 6ad0b73cb7ad..876cd5355158 100644 --- a/include/linux/range.h +++ b/include/linux/range.h @@ -13,11 +13,19 @@ static inline u64 range_len(const struct range *range) return range->end - range->start + 1; } +/* True if r1 completely contains r2 */ static inline bool range_contains(struct range *r1, struct range *r2) { return r1->start <= r2->start && r1->end >= r2->end; } +/* True if any part of r1 overlaps r2 */ +static inline bool range_overlaps(const struct range *r1, + const struct range *r2) +{ + return r1->start <= r2->end && r1->end >= r2->start; +} + int add_range(struct range *range, int az, int nr_range, u64 start, u64 end); -- cgit v1.2.3 From d1b3dad9de7962f7bea861f27e352bac17b68bed Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 5 Nov 2024 12:04:17 -0800 Subject: iommufd: Move struct iommufd_object to public iommufd header Prepare for an embedded structure design for driver-level iommufd_viommu objects: // include/linux/iommufd.h struct iommufd_viommu { struct iommufd_object obj; .... }; // Some IOMMU driver struct iommu_driver_viommu { struct iommufd_viommu core; .... }; It has to expose struct iommufd_object and enum iommufd_object_type from the core-level private header to the public iommufd header. Link: https://patch.msgid.link/r/54a43b0768089d690104530754f499ca05ce0074.1730836219.git.nicolinc@nvidia.com Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 30f832a60ccb..22948dd03d67 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -8,6 +8,7 @@ #include #include +#include #include struct device; @@ -18,6 +19,29 @@ struct iommufd_ctx; struct iommufd_device; struct page; +enum iommufd_object_type { + IOMMUFD_OBJ_NONE, + IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE, + IOMMUFD_OBJ_DEVICE, + IOMMUFD_OBJ_HWPT_PAGING, + IOMMUFD_OBJ_HWPT_NESTED, + IOMMUFD_OBJ_IOAS, + IOMMUFD_OBJ_ACCESS, + IOMMUFD_OBJ_FAULT, +#ifdef CONFIG_IOMMUFD_TEST + IOMMUFD_OBJ_SELFTEST, +#endif + IOMMUFD_OBJ_MAX, +}; + +/* Base struct for all objects with a userspace ID handle. */ +struct iommufd_object { + refcount_t shortterm_users; + refcount_t users; + enum iommufd_object_type type; + unsigned int id; +}; + struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, struct device *dev, u32 *id); void iommufd_device_unbind(struct iommufd_device *idev); -- cgit v1.2.3 From e8e26a0b09f5783be471b5ffb1e31822b1272c1d Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 15 Oct 2024 22:27:31 +0200 Subject: nfs: Annotate struct pnfs_commit_array with __counted_by() Add the __counted_by compiler attribute to the flexible array member buckets to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE. Compile-tested only. Signed-off-by: Thorsten Blum Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 12d8e47bc5a3..559273a0f16d 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1336,7 +1336,7 @@ struct pnfs_commit_array { struct rcu_head rcu; refcount_t refcount; unsigned int nbuckets; - struct pnfs_commit_bucket buckets[]; + struct pnfs_commit_bucket buckets[] __counted_by(nbuckets); }; struct pnfs_ds_commit_info { -- cgit v1.2.3 From e32c260195e6ff72940ab7826e38e0a0066fc58f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Nov 2024 11:49:04 -1000 Subject: sched_ext: Enable the ops breather and eject BPF scheduler on softlockup On 2 x Intel Sapphire Rapids machines with 224 logical CPUs, a poorly behaving BPF scheduler can live-lock the system by making multiple CPUs bang on the same DSQ to the point where soft-lockup detection triggers before SCX's own watchdog can take action. It also seems possible that the machine can be live-locked enough to prevent scx_ops_helper, which is an RT task, from running in a timely manner. Implement scx_softlockup() which is called when three quarters of soft-lockup threshold has passed. The function immediately enables the ops breather and triggers an ops error to initiate ejection of the BPF scheduler. The previous and this patch combined enable the kernel to reliably recover the system from live-lock conditions that can be triggered by a poorly behaving BPF scheduler on Intel dual socket systems. Signed-off-by: Tejun Heo Cc: Douglas Anderson Cc: Andrew Morton --- include/linux/sched/ext.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 1ddbde64a31b..65bc0a489cd2 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -205,11 +205,13 @@ struct sched_ext_entity { void sched_ext_free(struct task_struct *p); void print_scx_info(const char *log_lvl, struct task_struct *p); +void scx_softlockup(u32 dur_s); #else /* !CONFIG_SCHED_CLASS_EXT */ static inline void sched_ext_free(struct task_struct *p) {} static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} +static inline void scx_softlockup(u32 dur_s) {} #endif /* CONFIG_SCHED_CLASS_EXT */ #endif /* _LINUX_SCHED_EXT_H */ -- cgit v1.2.3 From 20fd1383cd616d61b2a79967da1221dc6cfb8430 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Thu, 7 Nov 2024 18:52:33 +0000 Subject: iio: Move __private marking before struct element priv in struct iio_dev This is to avoid tripping up kernel-doc which filters it out before but not after the name. Note the formatting is less than ideal as a result so we may revisit in future. Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 59c58f455311..ae65890d4567 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -624,7 +624,7 @@ struct iio_dev { const struct iio_info *info; const struct iio_buffer_setup_ops *setup_ops; - void *priv __private; + void *__private priv; }; int iio_device_id(struct iio_dev *indio_dev); -- cgit v1.2.3 From fffb9fff12250018a6f4d3e411f9d289210da329 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 4 Nov 2024 11:34:24 +0200 Subject: gpio: Get rid of GPIOF_ACTIVE_LOW No more users. Signed-off-by: Andy Shevchenko Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20241104093609.156059-7-andriy.shevchenko@linux.intel.com Signed-off-by: Bartosz Golaszewski --- include/linux/gpio.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio.h b/include/linux/gpio.h index 2d105be7bbc3..6270150f4e29 100644 --- a/include/linux/gpio.h +++ b/include/linux/gpio.h @@ -21,9 +21,6 @@ struct device; #define GPIOF_OUT_INIT_LOW ((0 << 0) | (0 << 1)) #define GPIOF_OUT_INIT_HIGH ((0 << 0) | (1 << 1)) -/* Gpio pin is active-low */ -#define GPIOF_ACTIVE_LOW (1 << 2) - /** * struct gpio - a structure describing a GPIO with configuration * @gpio: the GPIO number -- cgit v1.2.3 From f7f52738637f4361c108cad36e23ee98959a9006 Mon Sep 17 00:00:00 2001 From: Gilad Naaman Date: Thu, 7 Nov 2024 16:04:43 +0000 Subject: neighbour: Create netdev->neighbour association Create a mapping between a netdev and its neighoburs, allowing for much cheaper flushes. Signed-off-by: Gilad Naaman Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241107160444.2913124-7-gnaaman@drivenets.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3c552b648b27..df4483598628 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -52,6 +52,7 @@ #include #include #include +#include struct netpoll_info; struct device; @@ -2032,6 +2033,9 @@ enum netdev_reg_state { * @napi_defer_hard_irqs: If not zero, provides a counter that would * allow to avoid NIC hard IRQ, on busy queues. * + * @neighbours: List heads pointing to this device's neighbours' + * dev_list, one per address-family. + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -2440,6 +2444,9 @@ struct net_device { */ struct net_shaper_hierarchy *net_shaper_hierarchy; #endif + + struct hlist_head neighbours[NEIGH_NR_TABLES]; + u8 priv[] ____cacheline_aligned __counted_by(priv_len); } ____cacheline_aligned; -- cgit v1.2.3 From 79bc0af904db647979c735563299c9b0d820e432 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 10 Oct 2024 21:35:42 +0200 Subject: hwmon: Add static visibility member to struct hwmon_ops Several drivers return the same static value in their is_visible callback, what results in code duplication. Therefore add an option for drivers to specify a static visibility directly. Signed-off-by: Heiner Kallweit Message-ID: <89690b81-2c73-47ae-9ae9-45c77b45ca0c@gmail.com> groeck: Renamed hwmon_ops_is_visible -> hwmon_is_visible Signed-off-by: Guenter Roeck --- include/linux/hwmon.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index 5c6a421ad580..3a63dff62d03 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -368,7 +368,9 @@ enum hwmon_intrusion_attributes { /** * struct hwmon_ops - hwmon device operations - * @is_visible: Callback to return attribute visibility. Mandatory. + * @visible: Static visibility. If non-zero, 'is_visible' is ignored. + * @is_visible: Callback to return attribute visibility. Mandatory unless + * 'visible' is non-zero. * Parameters are: * @const void *drvdata: * Pointer to driver-private data structure passed @@ -412,6 +414,7 @@ enum hwmon_intrusion_attributes { * The function returns 0 on success or a negative error number. */ struct hwmon_ops { + umode_t visible; umode_t (*is_visible)(const void *drvdata, enum hwmon_sensor_types type, u32 attr, int channel); int (*read)(struct device *dev, enum hwmon_sensor_types type, -- cgit v1.2.3 From 7506ebcd662b868780774d191a7c024c18c557a8 Mon Sep 17 00:00:00 2001 From: Naresh Solanki Date: Mon, 7 Oct 2024 14:34:24 +0530 Subject: hwmon: (max6639) : Configure based on DT property Remove platform data & initialize with defaults configuration & overwrite based on DT properties. Signed-off-by: Naresh Solanki Message-ID: <20241007090426.811736-1-naresh.solanki@9elements.com> [groeck: Dropped some unnecessary empty lines] Signed-off-by: Guenter Roeck --- include/linux/platform_data/max6639.h | 15 --------------- 1 file changed, 15 deletions(-) delete mode 100644 include/linux/platform_data/max6639.h (limited to 'include/linux') diff --git a/include/linux/platform_data/max6639.h b/include/linux/platform_data/max6639.h deleted file mode 100644 index 65bfdb4fdc15..000000000000 --- a/include/linux/platform_data/max6639.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_MAX6639_H -#define _LINUX_MAX6639_H - -#include - -/* platform data for the MAX6639 temperature sensor and fan control */ - -struct max6639_platform_data { - bool pwm_polarity; /* Polarity low (0) or high (1, default) */ - int ppr; /* Pulses per rotation 1..4 (default == 2) */ - int rpm_range; /* 2000, 4000 (default), 8000 or 16000 */ -}; - -#endif /* _LINUX_MAX6639_H */ -- cgit v1.2.3 From e7ac4daeed91a25382091e73818ea0cddb1afd5e Mon Sep 17 00:00:00 2001 From: Barry Song Date: Thu, 7 Nov 2024 14:12:46 +1300 Subject: mm: count zeromap read and set for swapout and swapin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the proportion of folios from the zeromap is small, missing their accounting may not significantly impact profiling. However, it's easy to construct a scenario where this becomes an issue—for example, allocating 1 GB of memory, writing zeros from userspace, followed by MADV_PAGEOUT, and then swapping it back in. In this case, the swap-out and swap-in counts seem to vanish into a black hole, potentially causing semantic ambiguity. On the other hand, Usama reported that zero-filled pages can exceed 10% in workloads utilizing zswap, while Hailong noted that some app in Android have more than 6% zero-filled pages. Before commit 0ca0c24e3211 ("mm: store zero pages to be swapped out in a bitmap"), both zswap and zRAM implemented similar optimizations, leading to these optimized-out pages being counted in either zswap or zRAM counters (with pswpin/pswpout also increasing for zRAM). With zeromap functioning prior to both zswap and zRAM, userspace will no longer detect these swap-out and swap-in actions. We have three ways to address this: 1. Introduce a dedicated counter specifically for the zeromap. 2. Use pswpin/pswpout accounting, treating the zero map as a standard backend. This approach aligns with zRAM's current handling of same-page fills at the device level. However, it would mean losing the optimized-out page counters previously available in zRAM and would not align with systems using zswap. Additionally, as noted by Nhat Pham, pswpin/pswpout counters apply only to I/O done directly to the backend device. 3. Count zeromap pages under zswap, aligning with system behavior when zswap is enabled. However, this would not be consistent with zRAM, nor would it align with systems lacking both zswap and zRAM. Given the complications with options 2 and 3, this patch selects option 1. We can find these counters from /proc/vmstat (counters for the whole system) and memcg's memory.stat (counters for the interested memcg). For example: $ grep -E 'swpin_zero|swpout_zero' /proc/vmstat swpin_zero 1648 swpout_zero 33536 $ grep -E 'swpin_zero|swpout_zero' /sys/fs/cgroup/system.slice/memory.stat swpin_zero 3905 swpout_zero 3985 This patch does not address any specific zeromap bug, but the missing swpout and swpin counts for zero-filled pages can be highly confusing and may mislead user-space agents that rely on changes in these counters as indicators. Therefore, we add a Fixes tag to encourage the inclusion of this counter in any kernel versions with zeromap. Many thanks to Kanchana for the contribution of changing count_objcg_event() to count_objcg_events() to support large folios[1], which has now been incorporated into this patch. [1] https://lkml.kernel.org/r/20241001053222.6944-5-kanchana.p.sridhar@intel.com Link: https://lkml.kernel.org/r/20241107011246.59137-1-21cnbao@gmail.com Fixes: 0ca0c24e3211 ("mm: store zero pages to be swapped out in a bitmap") Co-developed-by: Kanchana P Sridhar Signed-off-by: Barry Song Reviewed-by: Nhat Pham Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Cc: Usama Arif Cc: Yosry Ahmed Cc: Hailong Liu Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Shakeel Butt Cc: Andi Kleen Cc: Baolin Wang Cc: Chris Li Cc: "Huang, Ying" Cc: Kairui Song Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 12 +++++++----- include/linux/vm_event_item.h | 2 ++ 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 34d2da05f2f1..e1b41554a5fb 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1760,8 +1760,9 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg) struct mem_cgroup *mem_cgroup_from_slab_obj(void *p); -static inline void count_objcg_event(struct obj_cgroup *objcg, - enum vm_event_item idx) +static inline void count_objcg_events(struct obj_cgroup *objcg, + enum vm_event_item idx, + unsigned long count) { struct mem_cgroup *memcg; @@ -1770,7 +1771,7 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, rcu_read_lock(); memcg = obj_cgroup_memcg(objcg); - count_memcg_events(memcg, idx, 1); + count_memcg_events(memcg, idx, count); rcu_read_unlock(); } @@ -1825,8 +1826,9 @@ static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) return NULL; } -static inline void count_objcg_event(struct obj_cgroup *objcg, - enum vm_event_item idx) +static inline void count_objcg_events(struct obj_cgroup *objcg, + enum vm_event_item idx, + unsigned long count) { } diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index aed952d04132..f70d0958095c 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -134,6 +134,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_SWAP SWAP_RA, SWAP_RA_HIT, + SWPIN_ZERO, + SWPOUT_ZERO, #ifdef CONFIG_KSM KSM_SWPIN_COPY, #endif -- cgit v1.2.3 From 69bad21551c9caea8c58800f96da48a704fd311e Mon Sep 17 00:00:00 2001 From: Kanchana P Sridhar Date: Mon, 30 Sep 2024 22:32:16 -0700 Subject: mm: define obj_cgroup_get() if CONFIG_MEMCG is not defined Patch series "mm: zswap swap-out of large folios", v10. This patch series enables zswap_store() to accept and store large folios. The most significant contribution in this series is from the earlier RFC submitted by Ryan Roberts [1]. Ryan's original RFC has been migrated to mm-unstable as of 9-30-2024 in patch 6 of this series, and adapted based on code review comments received for the current patch-series. [1]: [RFC PATCH v1] mm: zswap: Store large folios without splitting https://lore.kernel.org/linux-mm/20231019110543.3284654-1-ryan.roberts@arm.com/T/#u The first few patches do the prep work for supporting large folios in zswap_store. Patch 6 provides the main functionality to swap-out large folios in zswap. Patch 7 adds sysfs per-order hugepages "zswpout" counters that get incremented upon successful zswap_store of large folios, and also updates the documentation for this: /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/zswpout This patch series is a prerequisite for zswap compress batching of large folio swap-out and decompress batching of swap-ins based on swapin_readahead(), using Intel IAA hardware acceleration, which we would like to submit in subsequent patch-series, with performance improvement data. Thanks to Ying Huang for pre-posting review feedback and suggestions! Thanks also to Nhat, Yosry, Johannes, Barry, Chengming, Usama, Ying and Matthew for their helpful feedback, code/data reviews and suggestions! Co-development signoff request: =============================== I would like to thank Ryan Roberts for his original RFC [1] and request his co-developer signoff on patch 6 in this series. Thanks Ryan! System setup for testing: ========================= Testing of this patch series was done with mm-unstable as of 9-27-2024, commit de2fbaa6d9c3576ec7133ed02a370ec9376bf000 (without this patch-series) and mm-unstable 9-30-2024 commit c121617e3606be6575cdacfdb63cc8d67b46a568 (with this patch-series). Data was gathered on an Intel Sapphire Rapids server, dual-socket 56 cores per socket, 4 IAA devices per socket, 503 GiB RAM and 525G SSD disk partition swap. Core frequency was fixed at 2500MHz. The vm-scalability "usemem" test was run in a cgroup whose memory.high was fixed at 150G. The is no swap limit set for the cgroup. 30 usemem processes were run, each allocating and writing 10G of memory, and sleeping for 10 sec before exiting: usemem --init-time -w -O -s 10 -n 30 10g Other kernel configuration parameters: zswap compressors : zstd, deflate-iaa zswap allocator : zsmalloc vm.page-cluster : 2 In the experiments where "deflate-iaa" is used as the zswap compressor, IAA "compression verification" is enabled by default (cat /sys/bus/dsa/drivers/crypto/verify_compress). Hence each IAA compression will be decompressed internally by the "iaa_crypto" driver, the crc-s returned by the hardware will be compared and errors reported in case of mismatches. Thus "deflate-iaa" helps ensure better data integrity as compared to the software compressors, and the experimental data listed below is with verify_compress set to "1". Metrics reporting methodology: ============================== Total and average throughput are derived from the individual 30 processes' throughputs reported by usemem. elapsed/sys times are measured with perf. All percentage changes are "new" vs. "old"; hence a positive value denotes an increase in the metric, whether it is throughput or latency, and a negative value denotes a reduction in the metric. Positive throughput change percentages and negative latency change percentages denote improvements. The vm stats and sysfs hugepages stats included with the performance data provide details on the swapout activity to zswap/swap device. Testing labels used in data summaries: ====================================== The data refers to these test configurations and the before/after comparisons that they do: before-case1: ------------- mm-unstable 9-27-2024, CONFIG_THP_SWAP=N (compares zswap 4K vs. zswap 64K) In this scenario, CONFIG_THP_SWAP=N results in 64K/2M folios to be split into 4K folios that get processed by zswap. before-case2: ------------- mm-unstable 9-27-2024, CONFIG_THP_SWAP=Y (compares SSD swap large folios vs. zswap large folios) In this scenario, CONFIG_THP_SWAP=Y results in zswap rejecting large folios, which will then be stored by the SSD swap device. after: ------ v10 of this patch-series, CONFIG_THP_SWAP=Y The "after" is CONFIG_THP_SWAP=Y and v10 of this patch-series, that results in 64K/2M folios to not be split, and to be processed by zswap_store. Regression Testing: =================== I ran vm-scalability usemem without large folios, i.e., only 4K folios with mm-unstable and this patch-series. The main goal was to make sure that there is no functional or performance regression wrt the earlier zswap behavior for 4K folios, now that 4K folios will be processed by the new zswap_store() code. The data indicates there is no significant regression. ------------------------------------------------------------------------------- 4K folios: ========== zswap compressor zstd zstd zstd zstd v10 before-case1 before-case2 after vs. vs. case1 case2 ------------------------------------------------------------------------------- Total throughput (KB/s) 4,793,363 4,880,978 4,853,074 1% -1% Average throughput (KB/s) 159,778 162,699 161,769 1% -1% elapsed time (sec) 130.14 123.17 126.29 -3% 3% sys time (sec) 3,135.53 2,985.64 3,083.18 -2% 3% memcg_high 446,826 444,626 452,930 memcg_swap_fail 0 0 0 zswpout 48,932,107 48,931,971 48,931,820 zswpin 383 386 397 pswpout 0 0 0 pswpin 0 0 0 thp_swpout 0 0 0 thp_swpout_fallback 0 0 0 64kB-mthp_swpout_fallback 0 0 0 pgmajfault 3,063 3,077 3,479 swap_ra 93 94 96 swap_ra_hit 47 47 50 ZSWPOUT-64kB n/a n/a 0 SWPOUT-64kB 0 0 0 ------------------------------------------------------------------------------- Performance Testing: ==================== We list the data for 64K folios with before/after data per-compressor, followed by the same for 2M pmd-mappable folios. ------------------------------------------------------------------------------- 64K folios: zstd: ================= zswap compressor zstd zstd zstd zstd v10 before-case1 before-case2 after vs. vs. case1 case2 ------------------------------------------------------------------------------- Total throughput (KB/s) 5,222,213 1,076,611 6,159,776 18% 472% Average throughput (KB/s) 174,073 35,887 205,325 18% 472% elapsed time (sec) 120.50 347.16 108.33 -10% -69% sys time (sec) 2,930.33 248.16 2,549.65 -13% 927% memcg_high 416,773 552,200 465,874 memcg_swap_fail 3,192,906 1,293 1,012 zswpout 48,931,583 20,903 48,931,218 zswpin 384 363 410 pswpout 0 40,778,448 0 pswpin 0 16 0 thp_swpout 0 0 0 thp_swpout_fallback 0 0 0 64kB-mthp_swpout_fallback 3,192,906 1,293 1,012 pgmajfault 3,452 3,072 3,061 swap_ra 90 87 107 swap_ra_hit 42 43 57 ZSWPOUT-64kB n/a n/a 3,057,173 SWPOUT-64kB 0 2,548,653 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- 64K folios: deflate-iaa: ======================== zswap compressor deflate-iaa deflate-iaa deflate-iaa deflate-iaa v10 before-case1 before-case2 after vs. vs. case1 case2 ------------------------------------------------------------------------------- Total throughput (KB/s) 5,652,608 1,089,180 7,189,778 27% 560% Average throughput (KB/s) 188,420 36,306 239,659 27% 560% elapsed time (sec) 102.90 343.35 87.05 -15% -75% sys time (sec) 2,246.86 213.53 1,864.16 -17% 773% memcg_high 576,104 502,907 642,083 memcg_swap_fail 4,016,117 1,407 1,478 zswpout 61,163,423 22,444 57,798,716 zswpin 401 368 454 pswpout 0 40,862,080 0 pswpin 0 20 0 thp_swpout 0 0 0 thp_swpout_fallback 0 0 0 64kB-mthp_swpout_fallback 4,016,117 1,407 1,478 pgmajfault 3,063 3,153 3,122 swap_ra 96 93 156 swap_ra_hit 46 45 83 ZSWPOUT-64kB n/a n/a 3,611,032 SWPOUT-64kB 0 2,553,880 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- 2M folios: zstd: ================ zswap compressor zstd zstd zstd zstd v10 before-case1 before-case2 after vs. vs. case1 case2 ------------------------------------------------------------------------------- Total throughput (KB/s) 5,895,500 1,109,694 6,484,224 10% 484% Average throughput (KB/s) 196,516 36,989 216,140 10% 484% elapsed time (sec) 108.77 334.28 106.33 -2% -68% sys time (sec) 2,657.14 94.88 2,376.13 -11% 2404% memcg_high 64,200 66,316 56,898 memcg_swap_fail 101,182 70 27 zswpout 48,931,499 36,507 48,890,640 zswpin 380 379 377 pswpout 0 40,166,400 0 pswpin 0 0 0 thp_swpout 0 78,450 0 thp_swpout_fallback 101,182 70 27 2MB-mthp_swpout_fallback 0 0 27 pgmajfault 3,067 3,417 3,311 swap_ra 91 90 854 swap_ra_hit 45 45 810 ZSWPOUT-2MB n/a n/a 95,459 SWPOUT-2MB 0 78,450 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- 2M folios: deflate-iaa: ======================= zswap compressor deflate-iaa deflate-iaa deflate-iaa deflate-iaa v10 before-case1 before-case2 after vs. vs. case1 case2 ------------------------------------------------------------------------------- Total throughput (KB/s) 6,286,587 1,126,785 7,073,464 13% 528% Average throughput (KB/s) 209,552 37,559 235,782 13% 528% elapsed time (sec) 96.19 333.03 85.79 -11% -74% sys time (sec) 2,141.44 99.96 1,826.67 -15% 1727% memcg_high 99,253 64,666 79,718 memcg_swap_fail 129,074 53 165 zswpout 61,312,794 28,321 56,045,120 zswpin 383 406 403 pswpout 0 40,048,128 0 pswpin 0 0 0 thp_swpout 0 78,219 0 thp_swpout_fallback 129,074 53 165 2MB-mthp_swpout_fallback 0 0 165 pgmajfault 3,430 3,077 31,468 swap_ra 91 103 84,373 swap_ra_hit 47 46 84,317 ZSWPOUT-2MB n/a n/a 109,229 SWPOUT-2MB 0 78,219 0 ------------------------------------------------------------------------------- And finally, this is a comparison of deflate-iaa vs. zstd with v10 of this patch-series: --------------------------------------------- zswap_store large folios v10 Impr w/ deflate-iaa vs. zstd 64K folios 2M folios --------------------------------------------- Throughput (KB/s) 17% 9% elapsed time (sec) -20% -19% sys time (sec) -27% -23% --------------------------------------------- Conclusions based on the performance results: ============================================= v10 wrt before-case1: --------------------- We see significant improvements in throughput, elapsed and sys time for zstd and deflate-iaa, when comparing before-case1 (THP_SWAP=N) vs. after (THP_SWAP=Y) with zswap_store large folios. v10 wrt before-case2: --------------------- We see even more significant improvements in throughput and elapsed time for zstd and deflate-iaa, when comparing before-case2 (large-folio-SSD) vs. after (large-folio-zswap). The sys time increases with large-folio-zswap as expected, due to the CPU compression time vs. asynchronous disk write times, as pointed out by Ying and Yosry. In before-case2, when zswap does not store large folios, only allocations and cgroup charging due to 4K folio zswap stores count towards the cgroup memory limit. However, in the after scenario, with the introduction of zswap_store() of large folios, there is an added component of the zswap compressed pool usage from large folio stores from potentially all 30 processes, that gets counted towards the memory limit. As a result, we see higher swapout activity in the "after" data. Summary: ======== The v10 data presented above shows that zswap_store of large folios demonstrates good throughput/performance improvements compared to conventional SSD swap of large folios with a sufficiently large 525G SSD swap device. Hence, it seems reasonable for zswap_store to support large folios, so that further performance improvements can be implemented. In the experimental setup used in this patchset, we have enabled IAA compress verification to ensure additional hardware data integrity CRC checks not currently done by the software compressors. We see good throughput/latency improvements with deflate-iaa vs. zstd with zswap_store of large folios. Some of the ideas for further reducing latency that have shown promise in our experiments, are: 1) IAA compress/decompress batching. 2) Distributing compress jobs across all IAA devices on the socket. The tests run for this patchset are using only 1 IAA device per core, that avails of 2 compress engines on the device. In our experiments with IAA batching, we distribute compress jobs from all cores to the 8 compress engines available per socket. We further compress the pages in each folio in parallel in the accelerator. As a result, we improve compress latency and reclaim throughput. In decompress batching, we use swapin_readahead to generate a prefetch batch of 4K folios that we decompress in parallel in IAA. ------------------------------------------------------------------------------ IAA compress/decompress batching Further improvements wrt v10 zswap_store Sequential subpage store using "deflate-iaa": "deflate-iaa" Batching "deflate-iaa-canned" [2] Batching Additional Impr Additional Impr 64K folios 2M folios 64K folios 2M folios ------------------------------------------------------------------------------ Throughput (KB/s) 19% 43% 26% 55% elapsed time (sec) -5% -14% -10% -21% sys time (sec) 4% -7% -4% -18% ------------------------------------------------------------------------------ With zswap IAA compress/decompress batching, we are able to demonstrate significant performance improvements and memory savings in server scalability experiments in highly contended system scenarios under significant memory pressure; as compared to software compressors. We hope to submit this work in subsequent patch series. The current patch-series is a prequisite for these future submissions. [1] https://lore.kernel.org/linux-mm/20231019110543.3284654-1-ryan.roberts@arm.com/T/#u [2] https://patchwork.kernel.org/project/linux-crypto/cover/cover.1710969449.git.andre.glover@linux.intel.com/ This patch (of 6): This resolves an issue with obj_cgroup_get() not being defined if CONFIG_MEMCG is not defined. Before this patch, we would see build errors if obj_cgroup_get() is called from code that is agnostic of CONFIG_MEMCG. The zswap_store() changes for large folios in subsequent commits will require the use of obj_cgroup_get() in zswap code that falls into this category. Link: https://lkml.kernel.org/r/20241001053222.6944-1-kanchana.p.sridhar@intel.com Link: https://lkml.kernel.org/r/20241001053222.6944-2-kanchana.p.sridhar@intel.com Signed-off-by: Kanchana P Sridhar Reviewed-by: Nhat Pham Reviewed-by: Yosry Ahmed Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Cc: "Huang, Ying" Cc: Matthew Wilcox Cc: Ryan Roberts Cc: Shakeel Butt Cc: Usama Arif Cc: Wajdi Feghali Cc: "Zou, Nanhai" Cc: Barry Song <21cnbao@gmail.com> Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0bd8f61a5597..5502aa8e138e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1233,6 +1233,10 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css) return NULL; } +static inline void obj_cgroup_get(struct obj_cgroup *objcg) +{ +} + static inline void obj_cgroup_put(struct obj_cgroup *objcg) { } -- cgit v1.2.3 From 6e1fa555ec772046ec3b903f507ff7fed5323796 Mon Sep 17 00:00:00 2001 From: Kanchana P Sridhar Date: Mon, 30 Sep 2024 22:32:20 -0700 Subject: mm: zswap: modify zswap_stored_pages to be atomic_long_t For zswap_store() to support large folios, we need to be able to do a batch update of zswap_stored_pages upon successful store of all pages in the folio. For this, we need to add folio_nr_pages(), which returns a long, to zswap_stored_pages. Link: https://lkml.kernel.org/r/20241001053222.6944-6-kanchana.p.sridhar@intel.com Signed-off-by: Kanchana P Sridhar Acked-by: Yosry Ahmed Acked-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: "Huang, Ying" Cc: Matthew Wilcox Cc: Ryan Roberts Cc: Shakeel Butt Cc: Usama Arif Cc: Wajdi Feghali Cc: "Zou, Nanhai" Cc: Barry Song <21cnbao@gmail.com> Signed-off-by: Andrew Morton --- include/linux/zswap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 9cd1beef0654..d961ead91bf1 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -7,7 +7,7 @@ struct lruvec; -extern atomic_t zswap_stored_pages; +extern atomic_long_t zswap_stored_pages; #ifdef CONFIG_ZSWAP -- cgit v1.2.3 From 0c560dd86040556a9e55d88229d9295672428c78 Mon Sep 17 00:00:00 2001 From: Kanchana P Sridhar Date: Mon, 30 Sep 2024 22:32:22 -0700 Subject: mm: swap: count successful large folio zswap stores in hugepage zswpout stats Added a new MTHP_STAT_ZSWPOUT entry to the sysfs transparent_hugepage stats so that successful large folio zswap stores can be accounted under the per-order sysfs "zswpout" stats: /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/zswpout Other non-zswap swap device swap-out events will be counted under the existing sysfs "swpout" stats: /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/swpout Also, added documentation for the newly added sysfs per-order hugepage "zswpout" stats. The documentation clarifies that only non-zswap swapouts will be accounted in the existing "swpout" stats. Link: https://lkml.kernel.org/r/20241001053222.6944-8-kanchana.p.sridhar@intel.com Signed-off-by: Kanchana P Sridhar Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Ryan Roberts Cc: Shakeel Butt Cc: Usama Arif Cc: Wajdi Feghali Cc: Yosry Ahmed Cc: "Zou, Nanhai" Cc: Barry Song <21cnbao@gmail.com> Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 006f730545c2..c59e5aa9b081 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -119,6 +119,7 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_ALLOC, MTHP_STAT_ANON_FAULT_FALLBACK, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, + MTHP_STAT_ZSWPOUT, MTHP_STAT_SWPOUT, MTHP_STAT_SWPOUT_FALLBACK, MTHP_STAT_SHMEM_ALLOC, -- cgit v1.2.3 From aaf2914aec0fa67395574f6fa6b726168b049e60 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Sat, 26 Oct 2024 21:24:23 +1300 Subject: mm: add per-order mTHP swpin counters This helps profile the sizes of folios being swapped in. Currently, only mTHP swap-out is being counted. The new interface can be found at: /sys/kernel/mm/transparent_hugepage/hugepages-/stats swpin For example, cat /sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpin 12809 cat /sys/kernel/mm/transparent_hugepage/hugepages-32kB/stats/swpin 4763 [v-songbaohua@oppo.com: add a blank line in doc] Link: https://lkml.kernel.org/r/20241030233423.80759-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20241026082423.26298-1-21cnbao@gmail.com Signed-off-by: Barry Song Reviewed-by: Baolin Wang Acked-by: David Hildenbrand Cc: Chris Li Cc: Yosry Ahmed Cc: "Huang, Ying" Cc: Kairui Song Cc: Ryan Roberts Cc: Kanchana P Sridhar Cc: Usama Arif Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index c59e5aa9b081..b94c2e8ee918 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -120,6 +120,7 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_FALLBACK, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, MTHP_STAT_ZSWPOUT, + MTHP_STAT_SWPIN, MTHP_STAT_SWPOUT, MTHP_STAT_SWPOUT_FALLBACK, MTHP_STAT_SHMEM_ALLOC, -- cgit v1.2.3 From 5f6170a469cd2c13ad4dffe42714cf777b132451 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 28 Oct 2024 14:13:27 +0000 Subject: mm: pagewalk: add the ability to install PTEs Patch series "implement lightweight guard pages", v4. Userland library functions such as allocators and threading implementations often require regions of memory to act as 'guard pages' - mappings which, when accessed, result in a fatal signal being sent to the accessing process. The current means by which these are implemented is via a PROT_NONE mmap() mapping, which provides the required semantics however incur an overhead of a VMA for each such region. With a great many processes and threads, this can rapidly add up and incur a significant memory penalty. It also has the added problem of preventing merges that might otherwise be permitted. This series takes a different approach - an idea suggested by Vlastimil Babka (and before him David Hildenbrand and Jann Horn - perhaps more - the provenance becomes a little tricky to ascertain after this - please forgive any omissions!) - rather than locating the guard pages at the VMA layer, instead placing them in page tables mapping the required ranges. Early testing of the prototype version of this code suggests a 5 times speed up in memory mapping invocations (in conjunction with use of process_madvise()) and a 13% reduction in VMAs on an entirely idle android system and unoptimised code. We expect with optimisation and a loaded system with a larger number of guard pages this could significantly increase, but in any case these numbers are encouraging. This way, rather than having separate VMAs specifying which parts of a range are guard pages, instead we have a VMA spanning the entire range of memory a user is permitted to access and including ranges which are to be 'guarded'. After mapping this, a user can specify which parts of the range should result in a fatal signal when accessed. By restricting the ability to specify guard pages to memory mapped by existing VMAs, we can rely on the mappings being torn down when the mappings are ultimately unmapped and everything works simply as if the memory were not faulted in, from the point of view of the containing VMAs. This mechanism in effect poisons memory ranges similar to hardware memory poisoning, only it is an entirely software-controlled form of poisoning. The mechanism is implemented via madvise() behaviour - MADV_GUARD_INSTALL which installs page table-level guard page markers - and MADV_GUARD_REMOVE - which clears them. Guard markers can be installed across multiple VMAs and any existing mappings will be cleared, that is zapped, before installing the guard page markers in the page tables. There is no concept of 'nested' guard markers, multiple attempts to install guard markers in a range will, after the first attempt, have no effect. Importantly, removing guard markers over a range that contains both guard markers and ordinary backed memory has no effect on anything but the guard markers (including leaving huge pages un-split), so a user can safely remove guard markers over a range of memory leaving the rest intact. The actual mechanism by which the page table entries are specified makes use of existing logic - PTE markers, which are used for the userfaultfd UFFDIO_POISON mechanism. Unfortunately PTE_MARKER_POISONED is not suited for the guard page mechanism as it results in VM_FAULT_HWPOISON semantics in the fault handler, so we add our own specific PTE_MARKER_GUARD and adapt existing logic to handle it. We also extend the generic page walk mechanism to allow for installation of PTEs (carefully restricted to memory management logic only to prevent unwanted abuse). We ensure that zapping performed by MADV_DONTNEED and MADV_FREE do not remove guard markers, nor does forking (except when VM_WIPEONFORK is specified for a VMA which implies a total removal of memory characteristics). It's important to note that the guard page implementation is emphatically NOT a security feature, so a user can remove the markers if they wish. We simply implement it in such a way as to provide the least surprising behaviour. An extensive set of self-tests are provided which ensure behaviour is as expected and additionally self-documents expected behaviour of guard ranges. This patch (of 5): The existing generic pagewalk logic permits the walking of page tables, invoking callbacks at individual page table levels via user-provided mm_walk_ops callbacks. This is useful for traversing existing page table entries, but precludes the ability to establish new ones. Existing mechanism for performing a walk which also installs page table entries if necessary are heavily duplicated throughout the kernel, each with semantic differences from one another and largely unavailable for use elsewhere. Rather than add yet another implementation, we extend the generic pagewalk logic to enable the installation of page table entries by adding a new install_pte() callback in mm_walk_ops. If this is specified, then upon encountering a missing page table entry, we allocate and install a new one and continue the traversal. If a THP huge page is encountered at either the PMD or PUD level we split it only if there are ops->pte_entry() (or ops->pmd_entry at PUD level), otherwise if there is only an ops->install_pte(), we avoid the unnecessary split. We do not support hugetlb at this stage. If this function returns an error, or an allocation fails during the operation, we abort the operation altogether. It is up to the caller to deal appropriately with partially populated page table ranges. If install_pte() is defined, the semantics of pte_entry() change - this callback is then only invoked if the entry already exists. This is a useful property, as it allows a caller to handle existing PTEs while installing new ones where necessary in the specified range. If install_pte() is not defined, then there is no functional difference to this patch, so all existing logic will work precisely as it did before. As we only permit the installation of PTEs where a mapping does not already exist there is no need for TLB management, however we do invoke update_mmu_cache() for architectures which require manual maintenance of mappings for other CPUs. We explicitly do not allow the existing page walk API to expose this feature as it is dangerous and intended for internal mm use only. Therefore we provide a new walk_page_range_mm() function exposed only to mm/internal.h. We take the opportunity to additionally clean up the page walker logic to be a little easier to follow. Link: https://lkml.kernel.org/r/cover.1730123433.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/51b432ebef013e3fdf9f92101533435de1bffadf.1730123433.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jann Horn Reviewed-by: Vlastimil Babka Suggested-by: Vlastimil Babka Suggested-by: Jann Horn Suggested-by: David Hildenbrand Cc: Arnd Bergmann Cc: Christian Brauner Cc: Christoph Hellwig Cc: Chris Zankel Cc: Helge Deller Cc: James E.J. Bottomley Cc: Jeff Xu Cc: John Hubbard Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Matt Turner Cc: Max Filippov Cc: Muchun Song Cc: Paul E. McKenney Cc: Richard Henderson Cc: Shuah Khan Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pagewalk.h | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index f5eb5a32aeed..9700a29f8afb 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -25,12 +25,15 @@ enum page_walk_lock { * this handler is required to be able to handle * pmd_trans_huge() pmds. They may simply choose to * split_huge_page() instead of handling it explicitly. - * @pte_entry: if set, called for each PTE (lowest-level) entry, - * including empty ones + * @pte_entry: if set, called for each PTE (lowest-level) entry + * including empty ones, except if @install_pte is set. + * If @install_pte is set, @pte_entry is called only for + * existing PTEs. * @pte_hole: if set, called for each hole at all levels, * depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD. * Any folded depths (where PTRS_PER_P?D is equal to 1) - * are skipped. + * are skipped. If @install_pte is specified, this will + * not trigger for any populated ranges. * @hugetlb_entry: if set, called for each hugetlb entry. This hook * function is called with the vma lock held, in order to * protect against a concurrent freeing of the pte_t* or @@ -51,6 +54,13 @@ enum page_walk_lock { * @pre_vma: if set, called before starting walk on a non-null vma. * @post_vma: if set, called after a walk on a non-null vma, provided * that @pre_vma and the vma walk succeeded. + * @install_pte: if set, missing page table entries are installed and + * thus all levels are always walked in the specified + * range. This callback is then invoked at the PTE level + * (having split any THP pages prior), providing the PTE to + * install. If allocations fail, the walk is aborted. This + * operation is only available for userland memory. Not + * usable for hugetlb ranges. * * p?d_entry callbacks are called even if those levels are folded on a * particular architecture/configuration. @@ -76,6 +86,8 @@ struct mm_walk_ops { int (*pre_vma)(unsigned long start, unsigned long end, struct mm_walk *walk); void (*post_vma)(struct mm_walk *walk); + int (*install_pte)(unsigned long addr, unsigned long next, + pte_t *ptep, struct mm_walk *walk); enum page_walk_lock walk_lock; }; -- cgit v1.2.3 From 7c53dfbdb024915f23f03f972b33744309d4608b Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 28 Oct 2024 14:13:28 +0000 Subject: mm: add PTE_MARKER_GUARD PTE marker Add a new PTE marker that results in any access causing the accessing process to segfault. This is preferable to PTE_MARKER_POISONED, which results in the same handling as hardware poisoned memory, and is thus undesirable for cases where we simply wish to 'soft' poison a range. This is in preparation for implementing the ability to specify guard pages at the page table level, i.e. ranges that, when accessed, should cause process termination. Additionally, rename zap_drop_file_uffd_wp() to zap_drop_markers() - the function checks the ZAP_FLAG_DROP_MARKER flag so naming it for this single purpose was simply incorrect. We then reuse the same logic to determine whether a zap should clear a guard entry - this should only be performed on teardown and never on MADV_DONTNEED or MADV_FREE. We additionally add a WARN_ON_ONCE() in hugetlb logic should a guard marker be encountered there, as we explicitly do not support this operation and this should not occur. Link: https://lkml.kernel.org/r/f47f3d5acca2dcf9bbf655b6d33f3dc713e4a4a0.1730123433.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Suggested-by: Vlastimil Babka Suggested-by: Jann Horn Suggested-by: David Hildenbrand Cc: Arnd Bergmann Cc: Christian Brauner Cc: Christoph Hellwig Cc: Chris Zankel Cc: Helge Deller Cc: James E.J. Bottomley Cc: Jeff Xu Cc: John Hubbard Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Matt Turner Cc: Max Filippov Cc: Muchun Song Cc: Paul E. McKenney Cc: Richard Henderson Cc: Shuah Khan Cc: Shuah Khan Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 2 +- include/linux/swapops.h | 24 +++++++++++++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 355cf46a01a6..1b6a917fffa4 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -544,7 +544,7 @@ static inline pte_marker copy_pte_marker( { pte_marker srcm = pte_marker_get(entry); /* Always copy error entries. */ - pte_marker dstm = srcm & PTE_MARKER_POISONED; + pte_marker dstm = srcm & (PTE_MARKER_POISONED | PTE_MARKER_GUARD); /* Only copy PTE markers if UFFD register matches. */ if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma)) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index cb468e418ea1..96f26e29fefe 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -426,9 +426,19 @@ typedef unsigned long pte_marker; * "Poisoned" here is meant in the very general sense of "future accesses are * invalid", instead of referring very specifically to hardware memory errors. * This marker is meant to represent any of various different causes of this. + * + * Note that, when encountered by the faulting logic, PTEs with this marker will + * result in VM_FAULT_HWPOISON and thus regardless trigger hardware memory error + * logic. */ #define PTE_MARKER_POISONED BIT(1) -#define PTE_MARKER_MASK (BIT(2) - 1) +/* + * Indicates that, on fault, this PTE will case a SIGSEGV signal to be + * sent. This means guard markers behave in effect as if the region were mapped + * PROT_NONE, rather than if they were a memory hole or equivalent. + */ +#define PTE_MARKER_GUARD BIT(2) +#define PTE_MARKER_MASK (BIT(3) - 1) static inline swp_entry_t make_pte_marker_entry(pte_marker marker) { @@ -464,6 +474,18 @@ static inline int is_poisoned_swp_entry(swp_entry_t entry) { return is_pte_marker_entry(entry) && (pte_marker_get(entry) & PTE_MARKER_POISONED); + +} + +static inline swp_entry_t make_guard_swp_entry(void) +{ + return make_pte_marker_entry(PTE_MARKER_GUARD); +} + +static inline int is_guard_swp_entry(swp_entry_t entry) +{ + return is_pte_marker_entry(entry) && + (pte_marker_get(entry) & PTE_MARKER_GUARD); } /* -- cgit v1.2.3 From ab6e8e74e47362bd9d79dd4394a167b2afe0cc77 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 27 Oct 2024 13:14:42 -0700 Subject: mm: delete the unused put_pages_list() The last user of put_pages_list() converted away from it in 6.10 commit 06c375053cef ("iommu/vt-d: add wrapper functions for page allocations"): delete put_pages_list(). Link: https://lkml.kernel.org/r/d9985d6a-293e-176b-e63d-82fdfd28c139@google.com Signed-off-by: Hugh Dickins Acked-by: Peter Xu Acked-by: David Hildenbrand Reviewed-by: Yang Shi Cc: Matthew Wilcox (Oracle) Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 78848fbefe94..32888a97ab44 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1286,8 +1286,6 @@ static inline struct folio *virt_to_folio(const void *x) void __folio_put(struct folio *folio); -void put_pages_list(struct list_head *pages); - void split_page(struct page *page, unsigned int order); void folio_copy(struct folio *dst, struct folio *src); int folio_mc_copy(struct folio *dst, struct folio *src); -- cgit v1.2.3 From 48b50624aec454ce0fa8f78ef96e2f43bc0be495 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 21 Oct 2024 12:33:38 +0200 Subject: backing-file: clean up the API - Pass iocb to ctx->end_write() instead of file + pos - Get rid of ctx->user_file, which is redundant most of the time - Instead pass iocb to backing_file_splice_read and backing_file_splice_write Signed-off-by: Miklos Szeredi Signed-off-by: Amir Goldstein --- include/linux/backing-file.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h index 2eed0ffb5e8f..1476a6ed1bfd 100644 --- a/include/linux/backing-file.h +++ b/include/linux/backing-file.h @@ -14,9 +14,8 @@ struct backing_file_ctx { const struct cred *cred; - struct file *user_file; - void (*accessed)(struct file *); - void (*end_write)(struct file *, loff_t, ssize_t); + void (*accessed)(struct file *file); + void (*end_write)(struct kiocb *iocb, ssize_t); }; struct file *backing_file_open(const struct path *user_path, int flags, @@ -31,13 +30,13 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, struct kiocb *iocb, int flags, struct backing_file_ctx *ctx); -ssize_t backing_file_splice_read(struct file *in, loff_t *ppos, +ssize_t backing_file_splice_read(struct file *in, struct kiocb *iocb, struct pipe_inode_info *pipe, size_t len, unsigned int flags, struct backing_file_ctx *ctx); ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, - struct file *out, loff_t *ppos, size_t len, - unsigned int flags, + struct file *out, struct kiocb *iocb, + size_t len, unsigned int flags, struct backing_file_ctx *ctx); int backing_file_mmap(struct file *file, struct vm_area_struct *vma, struct backing_file_ctx *ctx); -- cgit v1.2.3 From 49dffdfde462c7823de6ed882f71ce233aaeba63 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Wed, 6 Nov 2024 16:57:17 -0800 Subject: cred: Add a light version of override/revert_creds() Add a light version of override/revert_creds(), this should only be used when the credentials in question will outlive the critical section and the critical section doesn't change the ->usage of the credentials. Suggested-by: Christian Brauner Signed-off-by: Vinicius Costa Gomes Signed-off-by: Amir Goldstein --- include/linux/cred.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index 2976f534a7a3..e4a3155fe409 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -172,6 +172,24 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred) cred->cap_inheritable)); } +/* + * Override creds without bumping reference count. Caller must ensure + * reference remains valid or has taken reference. Almost always not the + * interface you want. Use override_creds()/revert_creds() instead. + */ +static inline const struct cred *override_creds_light(const struct cred *override_cred) +{ + const struct cred *old = current->cred; + + rcu_assign_pointer(current->cred, override_cred); + return old; +} + +static inline void revert_creds_light(const struct cred *revert_cred) +{ + rcu_assign_pointer(current->cred, revert_cred); +} + /** * get_new_cred_many - Get references on a new set of credentials * @cred: The new credentials to reference -- cgit v1.2.3 From 5c2e7736e20d9b348a44cafbfa639fe2653fbc34 Mon Sep 17 00:00:00 2001 From: Eder Zulian Date: Thu, 7 Nov 2024 17:32:23 +0100 Subject: rust: helpers: Avoid raw_spin_lock initialization for PREEMPT_RT When PREEMPT_RT=y, spin locks are mapped to rt_mutex types, so using spinlock_check() + __raw_spin_lock_init() to initialize spin locks is incorrect, and would cause build errors. Introduce __spin_lock_init() to initialize a spin lock with lockdep rquired information for PREEMPT_RT builds, and use it in the Rust helper. Fixes: d2d6422f8bd1 ("x86: Allow to enable PREEMPT_RT.") Closes: https://lore.kernel.org/oe-kbuild-all/202409251238.vetlgXE9-lkp@intel.com/ Reported-by: kernel test robot Signed-off-by: Eder Zulian Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Boqun Feng Tested-by: Boqun Feng Link: https://lore.kernel.org/r/20241107163223.2092690-2-ezulian@redhat.com --- include/linux/spinlock_rt.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h index f9f14e135be7..f6499c37157d 100644 --- a/include/linux/spinlock_rt.h +++ b/include/linux/spinlock_rt.h @@ -16,22 +16,21 @@ static inline void __rt_spin_lock_init(spinlock_t *lock, const char *name, } #endif -#define spin_lock_init(slock) \ +#define __spin_lock_init(slock, name, key, percpu) \ do { \ - static struct lock_class_key __key; \ - \ rt_mutex_base_init(&(slock)->lock); \ - __rt_spin_lock_init(slock, #slock, &__key, false); \ + __rt_spin_lock_init(slock, name, key, percpu); \ } while (0) -#define local_spin_lock_init(slock) \ +#define _spin_lock_init(slock, percpu) \ do { \ static struct lock_class_key __key; \ - \ - rt_mutex_base_init(&(slock)->lock); \ - __rt_spin_lock_init(slock, #slock, &__key, true); \ + __spin_lock_init(slock, #slock, &__key, percpu); \ } while (0) +#define spin_lock_init(slock) _spin_lock_init(slock, false) +#define local_spin_lock_init(slock) _spin_lock_init(slock, true) + extern void rt_spin_lock(spinlock_t *lock) __acquires(lock); extern void rt_spin_lock_nested(spinlock_t *lock, int subclass) __acquires(lock); extern void rt_spin_lock_nest_lock(spinlock_t *lock, struct lockdep_map *nest_lock) __acquires(lock); -- cgit v1.2.3 From c554aa9ca976480839af342204e05bb4ce8367d5 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 1 Nov 2024 22:13:33 +0100 Subject: uprobes: Re-order struct uprobe_task to save some space On x86_64, with allmodconfig, struct uprobe_task is 72 bytes long, with a hole and some padding. /* size: 72, cachelines: 2, members: 7 */ /* sum members: 64, holes: 1, sum holes: 4 */ /* padding: 4 */ /* forced alignments: 1, forced holes: 1, sum forced holes: 4 */ /* last cacheline: 8 bytes */ Reorder the structure to fill the hole and avoid the padding. This way, the whole structure fits in a single cacheline and some memory is saved when it is allocated. /* size: 64, cachelines: 1, members: 7 */ /* forced alignments: 1 */ Signed-off-by: Christophe JAILLET Signed-off-by: Peter Zijlstra (Intel) Acked-by: "Masami Hiramatsu (Google)" Link: https://lore.kernel.org/r/a9f541d0cedf421f765c77a1fb93d6a979778a88.1730495562.git.christophe.jaillet@wanadoo.fr --- include/linux/uprobes.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 7a051b5d2edd..e0a4c2082245 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -121,6 +121,9 @@ struct hprobe { struct uprobe_task { enum uprobe_task_state state; + unsigned int depth; + struct return_instance *return_instances; + union { struct { struct arch_uprobe_task autask; @@ -138,9 +141,6 @@ struct uprobe_task { unsigned long xol_vaddr; struct arch_uprobe *auprobe; - - struct return_instance *return_instances; - unsigned int depth; }; struct return_consumer { -- cgit v1.2.3 From ed76c07c6885b249ce8486dac22fb97151a83185 Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Tue, 5 Nov 2024 16:45:08 -0300 Subject: printk: Introduce FORCE_CON flag Introduce FORCE_CON flag to printk. The new flag will make it possible to create a context where printk messages will never be suppressed. This mechanism will be used in the next patch to create a force_con context on sysrq handling, removing an existing workaround on the loglevel global variable. The workaround existed to make sure that sysrq header messages were sent to all consoles, but this doesn't work with deferred messages because the loglevel might be restored to its original value before a console flushes the messages. Signed-off-by: Marcos Paulo de Souza Reviewed-by: John Ogness Reviewed-by: Petr Mladek Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20241105-printk-loud-con-v2-1-bd3ecdf7b0e4@suse.com Signed-off-by: Petr Mladek --- include/linux/printk.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index eca9bb2ee637..232e5fd06701 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -166,6 +166,9 @@ __printf(1, 2) __cold int _printk_deferred(const char *fmt, ...); extern void __printk_deferred_enter(void); extern void __printk_deferred_exit(void); +extern void printk_force_console_enter(void); +extern void printk_force_console_exit(void); + /* * The printk_deferred_enter/exit macros are available only as a hack for * some code paths that need to defer all printk console printing. Interrupts -- cgit v1.2.3 From df3b8ca604f224eb4cd51669416ad4d607682273 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 4 Nov 2024 16:12:04 +0000 Subject: io_uring/cmd: let cmds to know about dying task When the taks that submitted a request is dying, a task work for that request might get run by a kernel thread or even worse by a half dismantled task. We can't just cancel the task work without running the callback as the cmd might need to do some clean up, so pass a flag instead. If set, it's not safe to access any task resources and the callback is expected to cancel the cmd ASAP. Reviewed-by: Jens Axboe Reviewed-by: Ming Lei Signed-off-by: Pavel Begunkov Signed-off-by: David Sterba --- include/linux/io_uring_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 4b9ba523978d..2ee5dc105b58 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -37,6 +37,7 @@ enum io_uring_cmd_flags { /* set when uring wants to cancel a previously issued command */ IO_URING_F_CANCEL = (1 << 11), IO_URING_F_COMPAT = (1 << 12), + IO_URING_F_TASK_DEAD = (1 << 13), }; struct io_wq_work_node { -- cgit v1.2.3 From 0ef2b9e698dbf9ba78f67952a747f35eb7060470 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Nov 2024 07:26:31 +0100 Subject: block: lift bio_is_zone_append to bio.h Make bio_is_zone_append globally available, because file systems need to use to check for a zone append bio in their end_io handlers to deal with the block layer emulation. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241104062647.91160-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 4a1bf43ca53d..60830a6a5939 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -675,6 +675,23 @@ static inline void bio_clear_polled(struct bio *bio) bio->bi_opf &= ~REQ_POLLED; } +/** + * bio_is_zone_append - is this a zone append bio? + * @bio: bio to check + * + * Check if @bio is a zone append operation. Core block layer code and end_io + * handlers must use this instead of an open coded REQ_OP_ZONE_APPEND check + * because the block layer can rewrite REQ_OP_ZONE_APPEND to REQ_OP_WRITE if + * it is not natively supported. + */ +static inline bool bio_is_zone_append(struct bio *bio) +{ + if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) + return false; + return bio_op(bio) == REQ_OP_ZONE_APPEND || + bio_flagged(bio, BIO_EMULATES_ZONE_APPEND); +} + struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, unsigned int nr_pages, blk_opf_t opf, gfp_t gfp); struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new); -- cgit v1.2.3 From f6b9a69a9e56b2083aca8a925fc1a28eb698e3ed Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 9 Nov 2024 15:14:29 -0800 Subject: bpf: Refactor active lock management When bpf_spin_lock was introduced originally, there was deliberation on whether to use an array of lock IDs, but since bpf_spin_lock is limited to holding a single lock at any given time, we've been using a single ID to identify the held lock. In preparation for introducing spin locks that can be taken multiple times, introduce support for acquiring multiple lock IDs. For this purpose, reuse the acquired_refs array and store both lock and pointer references. We tag the entry with REF_TYPE_PTR or REF_TYPE_LOCK to disambiguate and find the relevant entry. The ptr field is used to track the map_ptr or btf (for bpf_obj_new allocations) to ensure locks can be matched with protected fields within the same "allocation", i.e. bpf_obj_new object or map value. The struct active_lock is changed to an int as the state is part of the acquired_refs array, and we only need active_lock as a cheap way of detecting lock presence. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241109231430.2475236-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- include/linux/bpf_verifier.h | 53 +++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 4513372c5bc8..d84beed92ae4 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -48,22 +48,6 @@ enum bpf_reg_liveness { REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */ }; -/* For every reg representing a map value or allocated object pointer, - * we consider the tuple of (ptr, id) for them to be unique in verifier - * context and conside them to not alias each other for the purposes of - * tracking lock state. - */ -struct bpf_active_lock { - /* This can either be reg->map_ptr or reg->btf. If ptr is NULL, - * there's no active lock held, and other fields have no - * meaning. If non-NULL, it indicates that a lock is held and - * id member has the reg->id of the register which can be >= 0. - */ - void *ptr; - /* This will be reg->id */ - u32 id; -}; - #define ITER_PREFIX "bpf_iter_" enum bpf_iter_state { @@ -266,6 +250,13 @@ struct bpf_stack_state { }; struct bpf_reference_state { + /* Each reference object has a type. Ensure REF_TYPE_PTR is zero to + * default to pointer reference on zero initialization of a state. + */ + enum ref_state_type { + REF_TYPE_PTR = 0, + REF_TYPE_LOCK, + } type; /* Track each reference created with a unique id, even if the same * instruction creates the reference multiple times (eg, via CALL). */ @@ -274,17 +265,23 @@ struct bpf_reference_state { * is used purely to inform the user of a reference leak. */ int insn_idx; - /* There can be a case like: - * main (frame 0) - * cb (frame 1) - * func (frame 3) - * cb (frame 4) - * Hence for frame 4, if callback_ref just stored boolean, it would be - * impossible to distinguish nested callback refs. Hence store the - * frameno and compare that to callback_ref in check_reference_leak when - * exiting a callback function. - */ - int callback_ref; + union { + /* There can be a case like: + * main (frame 0) + * cb (frame 1) + * func (frame 3) + * cb (frame 4) + * Hence for frame 4, if callback_ref just stored boolean, it would be + * impossible to distinguish nested callback refs. Hence store the + * frameno and compare that to callback_ref in check_reference_leak when + * exiting a callback function. + */ + int callback_ref; + /* Use to keep track of the source object of a lock, to ensure + * it matches on unlock. + */ + void *ptr; + }; }; struct bpf_retval_range { @@ -332,6 +329,7 @@ struct bpf_func_state { /* The following fields should be last. See copy_func_state() */ int acquired_refs; + int active_locks; struct bpf_reference_state *refs; /* The state of the stack. Each element of the array describes BPF_REG_SIZE * (i.e. 8) bytes worth of stack memory. @@ -434,7 +432,6 @@ struct bpf_verifier_state { u32 insn_idx; u32 curframe; - struct bpf_active_lock active_lock; bool speculative; bool active_rcu_lock; u32 active_preempt_lock; -- cgit v1.2.3 From ae6e3a273f590a2b64f14a9fab3546c3a8f44ed4 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 9 Nov 2024 15:14:30 -0800 Subject: bpf: Drop special callback reference handling Logic to prevent callbacks from acquiring new references for the program (i.e. leaving acquired references), and releasing caller references (i.e. those acquired in parent frames) was introduced in commit 9d9d00ac29d0 ("bpf: Fix reference state management for synchronous callbacks"). This was necessary because back then, the verifier simulated each callback once (that could potentially be executed N times, where N can be zero). This meant that callbacks that left lingering resources or cleared caller resources could do it more than once, operating on undefined state or leaking memory. With the fixes to callback verification in commit ab5cfac139ab ("bpf: verify callbacks as if they are called unknown number of times"), all of this extra logic is no longer necessary. Hence, drop it as part of this commit. Cc: Eduard Zingerman Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241109231430.2475236-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- include/linux/bpf_verifier.h | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d84beed92ae4..3a74033d49c4 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -265,23 +265,10 @@ struct bpf_reference_state { * is used purely to inform the user of a reference leak. */ int insn_idx; - union { - /* There can be a case like: - * main (frame 0) - * cb (frame 1) - * func (frame 3) - * cb (frame 4) - * Hence for frame 4, if callback_ref just stored boolean, it would be - * impossible to distinguish nested callback refs. Hence store the - * frameno and compare that to callback_ref in check_reference_leak when - * exiting a callback function. - */ - int callback_ref; - /* Use to keep track of the source object of a lock, to ensure - * it matches on unlock. - */ - void *ptr; - }; + /* Use to keep track of the source object of a lock, to ensure + * it matches on unlock. + */ + void *ptr; }; struct bpf_retval_range { -- cgit v1.2.3 From 559218d43ec9dde3d2847c7aa127e88d6ab1c9ed Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 8 Nov 2024 16:46:51 +0100 Subject: block: pre-calculate max_zone_append_sectors max_zone_append_sectors differs from all other queue limits in that the final value used is not stored in the queue_limits but needs to be obtained using queue_limits_max_zone_append_sectors helper. This not only adds (tiny) extra overhead to the I/O path, but also can be easily forgotten in file system code. Add a new max_hw_zone_append_sectors value to queue_limits which is set by the driver, and calculate max_zone_append_sectors from that and the other inputs in blk_validate_zoned_limits, similar to how max_sectors is calculated to fix this. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241104073955.112324-3-hch@lst.de Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20241108154657.845768-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1b51a7c92e9b..65f37ae70712 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -375,6 +375,7 @@ struct queue_limits { unsigned int max_user_discard_sectors; unsigned int max_secure_erase_sectors; unsigned int max_write_zeroes_sectors; + unsigned int max_hw_zone_append_sectors; unsigned int max_zone_append_sectors; unsigned int discard_granularity; unsigned int discard_alignment; @@ -1208,25 +1209,9 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q) return q->limits.max_segment_size; } -static inline unsigned int -queue_limits_max_zone_append_sectors(const struct queue_limits *l) -{ - unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors); - - return min_not_zero(l->max_zone_append_sectors, max_sectors); -} - -static inline unsigned int queue_max_zone_append_sectors(struct request_queue *q) -{ - if (!blk_queue_is_zoned(q)) - return 0; - - return queue_limits_max_zone_append_sectors(&q->limits); -} - static inline bool queue_emulates_zone_append(struct request_queue *q) { - return blk_queue_is_zoned(q) && !q->limits.max_zone_append_sectors; + return blk_queue_is_zoned(q) && !q->limits.max_hw_zone_append_sectors; } static inline bool bdev_emulates_zone_append(struct block_device *bdev) @@ -1237,7 +1222,7 @@ static inline bool bdev_emulates_zone_append(struct block_device *bdev) static inline unsigned int bdev_max_zone_append_sectors(struct block_device *bdev) { - return queue_max_zone_append_sectors(bdev_get_queue(bdev)); + return bdev_limits(bdev)->max_zone_append_sectors; } static inline unsigned int bdev_max_segments(struct block_device *bdev) -- cgit v1.2.3 From 5a47c2080a7316f184107464e4f76737c0c05186 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 6 Nov 2024 15:34:46 +0800 Subject: nvmet: support reservation feature This patch implements the reservation feature, including: 1. reservation register(register, unregister and replace). 2. reservation acquire(acquire, preempt, preempt and abort). 3. reservation release(release and clear). 4. reservation report. 5. set feature and get feature of reservation notify mask. 6. get log page of reservation event. Not supported: 1. persistent reservation through power loss. Test cases: Use nvme-cli and fio to test all implemented sub features: 1. use nvme resv-register to register host a registrant or unregister or replace a new key. 2. use nvme resv-acquire to set host to the holder, and use fio to send read and write io in all reservation type. And also test preempt and "preempt and abort". 3. use nvme resv-report to show all registrants and reservation status. 4. use nvme resv-release to release all registrants. 5. use nvme get-log to get events generated by the preceding operations. In addition, make reservation configurable, one can set ns to support reservation before enable ns. The default of resv_enable is false. Signed-off-by: Guixin Liu Reviewed-by: Dmitry Bogdanov Reviewed-by: Christoph Hellwig Tested-by: Chaitanya Kulkarni Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- include/linux/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 44d048d68503..0179bb6d502d 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -2045,7 +2045,7 @@ enum { NVME_PR_LOG_EMPTY_LOG_PAGE = 0x00, NVME_PR_LOG_REGISTRATION_PREEMPTED = 0x01, NVME_PR_LOG_RESERVATION_RELEASED = 0x02, - NVME_PR_LOG_RESERVATOPM_PREEMPTED = 0x03, + NVME_PR_LOG_RESERVATOIN_PREEMPTED = 0x03, }; enum { -- cgit v1.2.3 From 61c9967cd63448292a64f9ee9aeb6e2053e3a624 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 4 Nov 2024 13:24:36 -0800 Subject: nvmet: implement active command set ns list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is required for nvme 2.1 for targets that support multiple command sets. We support NVM and ZNS, so are required to support this identification. Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Reviewed-by: Matias Bjørling Signed-off-by: Keith Busch --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 0179bb6d502d..26de7c5c12be 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -522,6 +522,7 @@ enum { NVME_ID_CNS_NS_DESC_LIST = 0x03, NVME_ID_CNS_CS_NS = 0x05, NVME_ID_CNS_CS_CTRL = 0x06, + NVME_ID_CNS_NS_ACTIVE_LIST_CS = 0x07, NVME_ID_CNS_NS_CS_INDEP = 0x08, NVME_ID_CNS_NS_PRESENT_LIST = 0x10, NVME_ID_CNS_NS_PRESENT = 0x11, -- cgit v1.2.3 From 83acb24e6de7bbb5cb0df1ba0f47a92da9112061 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 4 Nov 2024 14:00:14 -0800 Subject: nvmet: implement supported log pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This log is required for nvme 2.1. Reviewed-by: Matias Bjørling Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 26de7c5c12be..e9e508bca60f 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1245,6 +1245,7 @@ enum { NVME_FEAT_WRITE_PROTECT = 0x84, NVME_FEAT_VENDOR_START = 0xC0, NVME_FEAT_VENDOR_END = 0xFF, + NVME_LOG_SUPPORTED = 0x00, NVME_LOG_ERROR = 0x01, NVME_LOG_SMART = 0x02, NVME_LOG_FW_SLOT = 0x03, @@ -1262,6 +1263,14 @@ enum { NVME_FWACT_ACTV = (2 << 3), }; +struct nvme_supported_log { + __le32 lids[256]; +}; + +enum { + NVME_LIDS_LSUPP = 1 << 0, +}; + /* NVMe Namespace Write Protect State */ enum { NVME_NS_NO_WRITE_PROTECT = 0, -- cgit v1.2.3 From e973c91727d49bb128c95210b3aa1960b9421d18 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 4 Nov 2024 14:07:42 -0800 Subject: nvmet: implement supported features log MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This log is required for nvme 2.1. Reviewed-by: Matias Bjørling Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index e9e508bca60f..31d7ec6d8b93 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1256,6 +1256,7 @@ enum { NVME_LOG_TELEMETRY_CTRL = 0x08, NVME_LOG_ENDURANCE_GROUP = 0x09, NVME_LOG_ANA = 0x0c, + NVME_LOG_FEATURES = 0x12, NVME_LOG_DISC = 0x70, NVME_LOG_RESERVATION = 0x80, NVME_FWACT_REPL = (0 << 3), @@ -1271,6 +1272,16 @@ enum { NVME_LIDS_LSUPP = 1 << 0, }; +struct nvme_supported_features_log { + __le32 fis[256]; +}; + +enum { + NVME_FIS_FSUPP = 1 << 0, + NVME_FIS_NSCPE = 1 << 20, + NVME_FIS_CSCPE = 1 << 21, +}; + /* NVMe Namespace Write Protect State */ enum { NVME_NS_NO_WRITE_PROTECT = 0, -- cgit v1.2.3 From 266b652c65b44fb2ccfa17cdb54ce2ef723deb0a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 1 Nov 2024 14:46:01 -0700 Subject: nvmet: implement endurance groups Most of the returned information is just stubbed data. The target must support these in order to report rotational media. Since this driver doesn't know any better, each namespace is its own endurance group with the engid value matching the nsid. Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 31d7ec6d8b93..6d5b4299a1b2 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -327,7 +327,8 @@ struct nvme_id_ctrl { __le32 sanicap; __le32 hmminds; __le16 hmmaxd; - __u8 rsvd338[4]; + __le16 nvmsetidmax; + __le16 endgidmax; __u8 anatt; __u8 anacap; __le32 anagrpmax; @@ -531,6 +532,7 @@ enum { NVME_ID_CNS_SCNDRY_CTRL_LIST = 0x15, NVME_ID_CNS_NS_GRANULARITY = 0x16, NVME_ID_CNS_UUID_LIST = 0x17, + NVME_ID_CNS_ENDGRP_LIST = 0x19, }; enum { @@ -618,6 +620,28 @@ enum { NVME_NIDT_CSI = 0x04, }; +struct nvme_endurance_group_log { + __u8 egcw; + __u8 egfeat; + __u8 rsvd2; + __u8 avsp; + __u8 avspt; + __u8 pused; + __le16 did; + __u8 rsvd8[24]; + __u8 ee[16]; + __u8 dur[16]; + __u8 duw[16]; + __u8 muw[16]; + __u8 hrc[16]; + __u8 hwc[16]; + __u8 mdie[16]; + __u8 neile[16]; + __u8 tegcap[16]; + __u8 uegcap[16]; + __u8 rsvd192[320]; +}; + struct nvme_smart_log { __u8 critical_warning; __u8 temperature[2]; @@ -1302,7 +1326,8 @@ struct nvme_identify { __u8 cns; __u8 rsvd3; __le16 ctrlid; - __u8 rsvd11[3]; + __le16 cnssid; + __u8 rsvd11; __u8 csi; __u32 rsvd12[4]; }; -- cgit v1.2.3 From 5fd075cdaf3649000677d960fd9e45c08081b7e0 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 1 Nov 2024 13:48:47 -0700 Subject: nvmet: implement rotational media information log Most of the information is stubbed. Supporting these commands is a requirement for supporting rotational media. Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 6d5b4299a1b2..99cf0ee73714 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -642,6 +642,18 @@ struct nvme_endurance_group_log { __u8 rsvd192[320]; }; +struct nvme_rotational_media_log { + __le16 endgid; + __le16 numa; + __le16 nrs; + __u8 rsvd6[2]; + __le32 spinc; + __le32 fspinc; + __le32 ldc; + __le32 fldc; + __u8 rsvd24[488]; +}; + struct nvme_smart_log { __u8 critical_warning; __u8 temperature[2]; @@ -1281,6 +1293,7 @@ enum { NVME_LOG_ENDURANCE_GROUP = 0x09, NVME_LOG_ANA = 0x0c, NVME_LOG_FEATURES = 0x12, + NVME_LOG_RMI = 0x16, NVME_LOG_DISC = 0x70, NVME_LOG_RESERVATION = 0x80, NVME_FWACT_REPL = (0 << 3), @@ -1435,7 +1448,7 @@ struct nvme_get_log_page_command { __u8 lsp; /* upper 4 bits reserved */ __le16 numdl; __le16 numdu; - __u16 rsvd11; + __le16 lsi; union { struct { __le32 lpol; -- cgit v1.2.3 From e2758c76a0ab2032a0d11abc1c2ff08661fdf428 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 1 Nov 2024 12:29:40 -0700 Subject: nvmet: support for csi identify ns Implements reporting the I/O Command Set Independent Identify Namespace command. Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 99cf0ee73714..c136d64c7d73 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -563,6 +563,7 @@ enum { NVME_NS_FLBAS_LBA_SHIFT = 1, NVME_NS_FLBAS_META_EXT = 0x10, NVME_NS_NMIC_SHARED = 1 << 0, + NVME_NS_ROTATIONAL = 1 << 4, NVME_LBAF_RP_BEST = 0, NVME_LBAF_RP_BETTER = 1, NVME_LBAF_RP_GOOD = 2, -- cgit v1.2.3 From 8a825d22a70915bd80c811fa93538cf2540af29d Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Mon, 4 Nov 2024 16:55:00 +0800 Subject: nvme: check ns's volatile write cache not present When the VWC of a namespace does not exist, the BLK_FEAT_WRITE_CACHE flag should not be set when registering the block device, regardless of whether the controller supports VWC. Signed-off-by: Guixin Liu Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index c136d64c7d73..0a6e22038ce3 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -564,6 +564,7 @@ enum { NVME_NS_FLBAS_META_EXT = 0x10, NVME_NS_NMIC_SHARED = 1 << 0, NVME_NS_ROTATIONAL = 1 << 4, + NVME_NS_VWC_NOT_PRESENT = 1 << 5, NVME_LBAF_RP_BEST = 0, NVME_LBAF_RP_BETTER = 1, NVME_LBAF_RP_GOOD = 2, -- cgit v1.2.3 From 1af75b2ad08bd5977c51c2d0fc11741a4c0a48d9 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Sun, 10 Nov 2024 09:33:40 -0800 Subject: firmware: qcom: scm: Introduce CP_SMMU_APERTURE_ID The QCOM_SCM_SVC_MP service provides QCOM_SCM_MP_CP_SMMU_APERTURE_ID, which is used to trigger the mapping of register banks into the SMMU context for per-processes page tables to function (in case this isn't statically setup by firmware). This is necessary on e.g. QCS6490 Rb3Gen2, in order to avoid "CP | AHB bus error"-errors from the GPU. Introduce a function to allow the msm driver to invoke this call. Signed-off-by: Bjorn Andersson Reviewed-by: Rob Clark Link: https://lore.kernel.org/r/20241110-adreno-smmu-aparture-v2-1-9b1fb2ee41d4@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- include/linux/firmware/qcom/qcom_scm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/qcom/qcom_scm.h b/include/linux/firmware/qcom/qcom_scm.h index 9f14976399ab..4621aec0328c 100644 --- a/include/linux/firmware/qcom/qcom_scm.h +++ b/include/linux/firmware/qcom/qcom_scm.h @@ -85,6 +85,8 @@ int qcom_scm_io_writel(phys_addr_t addr, unsigned int val); bool qcom_scm_restore_sec_cfg_available(void); int qcom_scm_restore_sec_cfg(u32 device_id, u32 spare); +int qcom_scm_set_gpu_smmu_aperture(unsigned int context_bank); +bool qcom_scm_set_gpu_smmu_aperture_is_available(void); int qcom_scm_iommu_secure_ptbl_size(u32 spare, size_t *size); int qcom_scm_iommu_secure_ptbl_init(u64 addr, u32 size, u32 spare); int qcom_scm_iommu_set_cp_pool_size(u32 spare, u32 size); -- cgit v1.2.3 From b376d519bd142c65ba9bba35db12b6be95b46893 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 29 Sep 2024 20:50:16 -0400 Subject: xdrgen: Implement big-endian enums Signed-off-by: Chuck Lever --- include/linux/sunrpc/xdr.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 5f775e104f9a..a2ab813a9800 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -680,6 +680,27 @@ xdr_stream_decode_u32(struct xdr_stream *xdr, __u32 *ptr) return 0; } +/** + * xdr_stream_decode_be32 - Decode a big-endian 32-bit integer + * @xdr: pointer to xdr_stream + * @ptr: location to store integer + * + * Return values: + * %0 on success + * %-EBADMSG on XDR buffer overflow + */ +static inline ssize_t +xdr_stream_decode_be32(struct xdr_stream *xdr, __be32 *ptr) +{ + const size_t count = sizeof(*ptr); + __be32 *p = xdr_inline_decode(xdr, count); + + if (unlikely(!p)) + return -EBADMSG; + *ptr = *p; + return 0; +} + /** * xdr_stream_decode_u64 - Decode a 64-bit integer * @xdr: pointer to xdr_stream -- cgit v1.2.3 From 631c2925bae41c11dcf3915a2ab5f3be9af54277 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 3 Oct 2024 14:54:33 -0400 Subject: xdrgen: Keep track of on-the-wire data type widths The generic parts of the RPC layer need to know the widths (in XDR_UNIT increments) of the XDR data types defined for each protocol. As a first step, add dictionaries to keep track of the symbolic and actual maximum XDR width of XDR types. This makes it straightforward to look up the width of a type by its name. The built-in dictionaries are pre-loaded with the widths of the built-in XDR types as defined in RFC 4506. Signed-off-by: Chuck Lever --- include/linux/sunrpc/xdrgen/_defs.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdrgen/_defs.h b/include/linux/sunrpc/xdrgen/_defs.h index be9e62371758..20c7270aa64d 100644 --- a/include/linux/sunrpc/xdrgen/_defs.h +++ b/include/linux/sunrpc/xdrgen/_defs.h @@ -23,4 +23,13 @@ typedef struct { u8 *data; } opaque; +#define XDR_void (0) +#define XDR_bool (1) +#define XDR_int (1) +#define XDR_unsigned_int (1) +#define XDR_long (1) +#define XDR_unsigned_long (1) +#define XDR_hyper (2) +#define XDR_unsigned_hyper (2) + #endif /* _SUNRPC_XDRGEN__DEFS_H_ */ -- cgit v1.2.3 From 65941f10caf2c04781a7defa4ec0ab119dbd235a Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Mon, 28 Oct 2024 19:53:37 +0800 Subject: mm: move the page fragment allocator from page_alloc into its own file Inspired by [1], move the page fragment allocator from page_alloc into its own c file and header file, as we are about to make more change for it to replace another page_frag implementation in sock.c As this patchset is going to replace 'struct page_frag' with 'struct page_frag_cache' in sched.h, including page_frag_cache.h in sched.h has a compiler error caused by interdependence between mm_types.h and mm.h for asm-offsets.c, see [2]. So avoid the compiler error by moving 'struct page_frag_cache' to mm_types_task.h as suggested by Alexander, see [3]. 1. https://lore.kernel.org/all/20230411160902.4134381-3-dhowells@redhat.com/ 2. https://lore.kernel.org/all/15623dac-9358-4597-b3ee-3694a5956920@gmail.com/ 3. https://lore.kernel.org/all/CAKgT0UdH1yD=LSCXFJ=YM_aiA4OomD-2wXykO42bizaWMt_HOA@mail.gmail.com/ CC: David Howells CC: Linux-MM Signed-off-by: Yunsheng Lin Acked-by: Andrew Morton Reviewed-by: Alexander Duyck Link: https://patch.msgid.link/20241028115343.3405838-3-linyunsheng@huawei.com Signed-off-by: Jakub Kicinski --- include/linux/gfp.h | 22 ---------------------- include/linux/mm_types.h | 18 ------------------ include/linux/mm_types_task.h | 18 ++++++++++++++++++ include/linux/page_frag_cache.h | 31 +++++++++++++++++++++++++++++++ include/linux/skbuff.h | 1 + 5 files changed, 50 insertions(+), 40 deletions(-) create mode 100644 include/linux/page_frag_cache.h (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index a951de920e20..a0a6d25f883f 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -371,28 +371,6 @@ __meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mas extern void __free_pages(struct page *page, unsigned int order); extern void free_pages(unsigned long addr, unsigned int order); -struct page_frag_cache; -void page_frag_cache_drain(struct page_frag_cache *nc); -extern void __page_frag_cache_drain(struct page *page, unsigned int count); -void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz, - gfp_t gfp_mask, unsigned int align_mask); - -static inline void *page_frag_alloc_align(struct page_frag_cache *nc, - unsigned int fragsz, gfp_t gfp_mask, - unsigned int align) -{ - WARN_ON_ONCE(!is_power_of_2(align)); - return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align); -} - -static inline void *page_frag_alloc(struct page_frag_cache *nc, - unsigned int fragsz, gfp_t gfp_mask) -{ - return __page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u); -} - -extern void page_frag_free(void *addr); - #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6e3bdf8e38bc..92314ef2d978 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -521,9 +521,6 @@ static_assert(sizeof(struct ptdesc) <= sizeof(struct page)); */ #define STRUCT_PAGE_MAX_SHIFT (order_base_2(sizeof(struct page))) -#define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) -#define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) - /* * page_private can be used on tail pages. However, PagePrivate is only * checked by the VM on the head page. So page_private on the tail pages @@ -542,21 +539,6 @@ static inline void *folio_get_private(struct folio *folio) return folio->private; } -struct page_frag_cache { - void * va; -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) - __u16 offset; - __u16 size; -#else - __u32 offset; -#endif - /* we maintain a pagecount bias, so that we dont dirty cache line - * containing page->_refcount every time we allocate a fragment. - */ - unsigned int pagecnt_bias; - bool pfmemalloc; -}; - typedef unsigned long vm_flags_t; /* diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index bff5706b76e1..0ac6daebdd5c 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -8,6 +8,7 @@ * (These are defined separately to decouple sched.h from mm_types.h as much as possible.) */ +#include #include #include @@ -43,6 +44,23 @@ struct page_frag { #endif }; +#define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) +#define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) +struct page_frag_cache { + void *va; +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) + __u16 offset; + __u16 size; +#else + __u32 offset; +#endif + /* we maintain a pagecount bias, so that we dont dirty cache line + * containing page->_refcount every time we allocate a fragment. + */ + unsigned int pagecnt_bias; + bool pfmemalloc; +}; + /* Track pages that require TLB flushes */ struct tlbflush_unmap_batch { #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h new file mode 100644 index 000000000000..67ac8626ed9b --- /dev/null +++ b/include/linux/page_frag_cache.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_PAGE_FRAG_CACHE_H +#define _LINUX_PAGE_FRAG_CACHE_H + +#include +#include +#include + +void page_frag_cache_drain(struct page_frag_cache *nc); +void __page_frag_cache_drain(struct page *page, unsigned int count); +void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz, + gfp_t gfp_mask, unsigned int align_mask); + +static inline void *page_frag_alloc_align(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask, + unsigned int align) +{ + WARN_ON_ONCE(!is_power_of_2(align)); + return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align); +} + +static inline void *page_frag_alloc(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask) +{ + return __page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u); +} + +void page_frag_free(void *addr); + +#endif diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e5095d75abba..60535c706851 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include -- cgit v1.2.3 From 3d18dfe69ce46f106af327736d2261d7e3ee81c0 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Mon, 28 Oct 2024 19:53:39 +0800 Subject: mm: page_frag: avoid caller accessing 'page_frag_cache' directly Use appropriate frag_page API instead of caller accessing 'page_frag_cache' directly. CC: Andrew Morton CC: Linux-MM Signed-off-by: Yunsheng Lin Reviewed-by: Alexander Duyck Acked-by: Chuck Lever Link: https://patch.msgid.link/20241028115343.3405838-5-linyunsheng@huawei.com Signed-off-by: Jakub Kicinski --- include/linux/page_frag_cache.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h index 67ac8626ed9b..0a52f7a179c8 100644 --- a/include/linux/page_frag_cache.h +++ b/include/linux/page_frag_cache.h @@ -7,6 +7,16 @@ #include #include +static inline void page_frag_cache_init(struct page_frag_cache *nc) +{ + nc->va = NULL; +} + +static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc) +{ + return !!nc->pfmemalloc; +} + void page_frag_cache_drain(struct page_frag_cache *nc); void __page_frag_cache_drain(struct page *page, unsigned int count); void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz, -- cgit v1.2.3 From 0c3ce2f50261cd2f654d931eeb933c370a3a7d7a Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Mon, 28 Oct 2024 19:53:41 +0800 Subject: mm: page_frag: reuse existing space for 'size' and 'pfmemalloc' Currently there is one 'struct page_frag' for every 'struct sock' and 'struct task_struct', we are about to replace the 'struct page_frag' with 'struct page_frag_cache' for them. Before begin the replacing, we need to ensure the size of 'struct page_frag_cache' is not bigger than the size of 'struct page_frag', as there may be tens of thousands of 'struct sock' and 'struct task_struct' instances in the system. By or'ing the page order & pfmemalloc with lower bits of 'va' instead of using 'u16' or 'u32' for page size and 'u8' for pfmemalloc, we are able to avoid 3 or 5 bytes space waste. And page address & pfmemalloc & order is unchanged for the same page in the same 'page_frag_cache' instance, it makes sense to fit them together. After this patch, the size of 'struct page_frag_cache' should be the same as the size of 'struct page_frag'. CC: Andrew Morton CC: Linux-MM Signed-off-by: Yunsheng Lin Reviewed-by: Alexander Duyck Link: https://patch.msgid.link/20241028115343.3405838-7-linyunsheng@huawei.com Signed-off-by: Jakub Kicinski --- include/linux/mm_types_task.h | 19 +++++++++++-------- include/linux/page_frag_cache.h | 24 ++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index 0ac6daebdd5c..a82aa80c0ba4 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -47,18 +47,21 @@ struct page_frag { #define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) #define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) struct page_frag_cache { - void *va; -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) + /* encoded_page consists of the virtual address, pfmemalloc bit and + * order of a page. + */ + unsigned long encoded_page; + + /* we maintain a pagecount bias, so that we dont dirty cache line + * containing page->_refcount every time we allocate a fragment. + */ +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32) __u16 offset; - __u16 size; + __u16 pagecnt_bias; #else __u32 offset; + __u32 pagecnt_bias; #endif - /* we maintain a pagecount bias, so that we dont dirty cache line - * containing page->_refcount every time we allocate a fragment. - */ - unsigned int pagecnt_bias; - bool pfmemalloc; }; /* Track pages that require TLB flushes */ diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h index 0a52f7a179c8..41a91df82631 100644 --- a/include/linux/page_frag_cache.h +++ b/include/linux/page_frag_cache.h @@ -3,18 +3,38 @@ #ifndef _LINUX_PAGE_FRAG_CACHE_H #define _LINUX_PAGE_FRAG_CACHE_H +#include #include #include #include +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) +/* Use a full byte here to enable assembler optimization as the shift + * operation is usually expecting a byte. + */ +#define PAGE_FRAG_CACHE_ORDER_MASK GENMASK(7, 0) +#else +/* Compiler should be able to figure out we don't read things as any value + * ANDed with 0 is 0. + */ +#define PAGE_FRAG_CACHE_ORDER_MASK 0 +#endif + +#define PAGE_FRAG_CACHE_PFMEMALLOC_BIT (PAGE_FRAG_CACHE_ORDER_MASK + 1) + +static inline bool encoded_page_decode_pfmemalloc(unsigned long encoded_page) +{ + return !!(encoded_page & PAGE_FRAG_CACHE_PFMEMALLOC_BIT); +} + static inline void page_frag_cache_init(struct page_frag_cache *nc) { - nc->va = NULL; + nc->encoded_page = 0; } static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc) { - return !!nc->pfmemalloc; + return encoded_page_decode_pfmemalloc(nc->encoded_page); } void page_frag_cache_drain(struct page_frag_cache *nc); -- cgit v1.2.3 From d2bd39c0456b75be9dfc7d774b8d021355c26ae3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 18 Oct 2024 17:47:49 +0300 Subject: PCI: Store all PCIe Supported Link Speeds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PCIe bandwidth controller added by a subsequent commit will require selecting PCIe Link Speeds that are lower than the Maximum Link Speed. The struct pci_bus only stores max_bus_speed. Even if PCIe r6.1 sec 8.2.1 currently disallows gaps in supported Link Speeds, the Implementation Note in PCIe r6.1 sec 7.5.3.18, recommends determining supported Link Speeds using the Supported Link Speeds Vector in the Link Capabilities 2 Register (when available) to "avoid software being confused if a future specification defines Links that do not require support for all slower speeds." Reuse code in pcie_get_speed_cap() to add pcie_get_supported_speeds() to query the Supported Link Speeds Vector of a PCIe device. The value is taken directly from the Supported Link Speeds Vector or synthesized from the Max Link Speed in the Link Capabilities Register when the Link Capabilities 2 Register is not available. The Supported Link Speeds Vector in the Link Capabilities Register 2 corresponds to the bus below on Root Ports and Downstream Ports, whereas it corresponds to the bus above on Upstream Ports and Endpoints (PCIe r6.1 sec 7.5.3.18): Supported Link Speeds Vector - This field indicates the supported Link speed(s) of the associated Port. Add supported_speeds into the struct pci_dev that caches the Supported Link Speeds Vector. supported_speeds contains a set of Link Speeds only in the case where PCIe Link Speed can be determined. Root Complex Integrated Endpoints do not have a well-defined Link Speed because they do not implement either of the Link Capabilities Registers, which is allowed by PCIe r6.1 sec 7.5.3 (the same limitation applies to determining cur_bus_speed and max_bus_speed that are PCI_SPEED_UNKNOWN in such case). This is of no concern from PCIe bandwidth controller point of view because such devices are not attached into a PCIe Root Port that could be controlled. The supported_speeds field keeps the extra reserved zero at the least significant bit to match the Link Capabilities 2 Register layout. An attempt was made to store supported_speeds field into the struct pci_bus as an intersection of both ends of the Link, however, the subordinate struct pci_bus is not available early enough. The Target Speed quirk (in pcie_failed_link_retrain()) can run either during initial scan or later, requiring it to use the API provided by the PCIe bandwidth controller to set the Target Link Speed in order to co-exist with the bandwidth controller. When the Target Speed quirk is calling the bandwidth controller during initial scan, the struct pci_bus is not yet initialized. As such, storing supported_speeds into the struct pci_bus is not viable. Suggested-by: Lukas Wunner Link: https://lore.kernel.org/r/20241018144755.7875-4-ilpo.jarvinen@linux.intel.com Signed-off-by: Ilpo Järvinen [bhelgaas: move pcie_get_supported_speeds() decl to drivers/pci/pci.h] Signed-off-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron --- include/linux/pci.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index be5ed534c39c..99c6fa30d25b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -318,7 +318,14 @@ struct pci_sriov; struct pci_p2pdma; struct rcec_ea; -/* The pci_dev structure describes PCI devices */ +/* struct pci_dev - describes a PCI device + * + * @supported_speeds: PCIe Supported Link Speeds Vector (+ reserved 0 at + * LSB). 0 when the supported speeds cannot be + * determined (e.g., for Root Complex Integrated + * Endpoints without the relevant Capability + * Registers). + */ struct pci_dev { struct list_head bus_list; /* Node in per-bus list */ struct pci_bus *bus; /* Bus this device is on */ @@ -522,6 +529,7 @@ struct pci_dev { struct npem *npem; /* Native PCIe Enclosure Management */ #endif u16 acs_cap; /* ACS Capability offset */ + u8 supported_speeds; /* Supported Link Speeds Vector */ phys_addr_t rom; /* Physical address if not from BAR */ size_t romlen; /* Length if not from BAR */ /* -- cgit v1.2.3 From 38a18dfe9035d5a02a53271824de1854129c61dc Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 22 Oct 2024 15:48:51 -0700 Subject: PCI: Unexport pci_walk_bus_locked() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's only one user of pci_walk_bus_locked(), and it's internal to the PCI core. Unexport it and make it private to drivers/pci/. Link: https://lore.kernel.org/r/20241022224851.340648-6-kbusch@meta.com Signed-off-by: Keith Busch [bhelgaas: move decl to drivers/pci/pci.h] Signed-off-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Reviewed-by: Davidlohr Bueso Reviewed-by: Ilpo Järvinen --- include/linux/pci.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 573b4c4c2be6..056290377273 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1623,8 +1623,6 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), void *userdata); -void pci_walk_bus_locked(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), - void *userdata); int pci_cfg_space_size(struct pci_dev *dev); unsigned char pci_bus_max_busnr(struct pci_bus *bus); void pci_setup_bridge(struct pci_bus *bus); -- cgit v1.2.3 From fb56007c9bc38550755a53f7afd71a287340b724 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 6 Nov 2024 06:10:19 +0000 Subject: vt_buffer.h: get rid of dead code in default scr_...() instances Only 4 architectures define VT_BUF_HAVE_RW (alpha, mips, powerpc, sparc) and all of them define VT_BUF_HAVE_MEM{SET,CPY,MOVE}W. In other words, the code under #ifdef VT_BUF_HAVE_RW in default scr_mem...w() instances won't be compiled anyway. Signed-off-by: Al Viro Signed-off-by: Arnd Bergmann --- include/linux/vt_buffer.h | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vt_buffer.h b/include/linux/vt_buffer.h index 919d999a8c1d..b6eeb8cb6070 100644 --- a/include/linux/vt_buffer.h +++ b/include/linux/vt_buffer.h @@ -28,45 +28,21 @@ #ifndef VT_BUF_HAVE_MEMSETW static inline void scr_memsetw(u16 *s, u16 c, unsigned int count) { -#ifdef VT_BUF_HAVE_RW - count /= 2; - while (count--) - scr_writew(c, s++); -#else memset16(s, c, count / 2); -#endif } #endif #ifndef VT_BUF_HAVE_MEMCPYW static inline void scr_memcpyw(u16 *d, const u16 *s, unsigned int count) { -#ifdef VT_BUF_HAVE_RW - count /= 2; - while (count--) - scr_writew(scr_readw(s++), d++); -#else memcpy(d, s, count); -#endif } #endif #ifndef VT_BUF_HAVE_MEMMOVEW static inline void scr_memmovew(u16 *d, const u16 *s, unsigned int count) { -#ifdef VT_BUF_HAVE_RW - if (d < s) - scr_memcpyw(d, s, count); - else { - count /= 2; - d += count; - s += count; - while (count--) - scr_writew(scr_readw(--s), --d); - } -#else memmove(d, s, count); -#endif } #endif -- cgit v1.2.3 From 82ee16cfb290ae259d1cd6658a6988b430258e94 Mon Sep 17 00:00:00 2001 From: Karel Balej Date: Sat, 12 Oct 2024 21:31:39 +0200 Subject: rtc: add driver for Marvell 88PM886 PMIC RTC RTC lives on the chip's base register page. Add the relevant register definitions and implement a basic set/read time functionality. Tested with the samsung,coreprimevelte smartphone which contains this PMIC and whose vendor kernel tree has also served as the sole reference for this. Signed-off-by: Karel Balej Acked-by: Lee Jones Link: https://lore.kernel.org/r/20241012193345.18594-2-balejk@matfyz.cz Signed-off-by: Alexandre Belloni --- include/linux/mfd/88pm886.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/88pm886.h b/include/linux/mfd/88pm886.h index 133aa302e492..85eca44f39ab 100644 --- a/include/linux/mfd/88pm886.h +++ b/include/linux/mfd/88pm886.h @@ -31,6 +31,15 @@ #define PM886_INT_WC BIT(1) #define PM886_INT_MASK_MODE BIT(2) +#define PM886_REG_RTC_CNT1 0xd1 +#define PM886_REG_RTC_CNT2 0xd2 +#define PM886_REG_RTC_CNT3 0xd3 +#define PM886_REG_RTC_CNT4 0xd4 +#define PM886_REG_RTC_SPARE1 0xea +#define PM886_REG_RTC_SPARE2 0xeb +#define PM886_REG_RTC_SPARE3 0xec +#define PM886_REG_RTC_SPARE4 0xed +#define PM886_REG_RTC_SPARE5 0xee #define PM886_REG_RTC_SPARE6 0xef #define PM886_REG_BUCK_EN 0x08 -- cgit v1.2.3 From a7306f3c283bfe03611229bb6280987aae2af8f9 Mon Sep 17 00:00:00 2001 From: Nataniel Farzan Date: Mon, 4 Nov 2024 19:22:32 -0800 Subject: Improve consistency of '#error' directive messages Remove the use of contractions and use proper punctuation in #error directive messages that discourage the direct inclusion of header files. Link: https://lkml.kernel.org/r/20241105032231.28833-1-natanielfarzan@gmail.com Signed-off-by: Nataniel Farzan Signed-off-by: Andrew Morton --- include/linux/compiler-clang.h | 2 +- include/linux/compiler-gcc.h | 2 +- include/linux/pm_wakeup.h | 2 +- include/linux/rwlock.h | 2 +- include/linux/rwlock_api_smp.h | 2 +- include/linux/spinlock_api_smp.h | 2 +- include/linux/spinlock_types_up.h | 2 +- include/linux/spinlock_up.h | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 4c1a39dcb624..2e7c2c282f3a 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_COMPILER_TYPES_H -#error "Please don't include directly, include instead." +#error "Please do not include directly, include instead." #endif /* Compiler specific definitions for Clang compiler */ diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index cd6f9aae311f..d0ed9583743f 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_COMPILER_TYPES_H -#error "Please don't include directly, include instead." +#error "Please do not include directly, include instead." #endif /* diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h index 76cd1f9f1365..222f7530806c 100644 --- a/include/linux/pm_wakeup.h +++ b/include/linux/pm_wakeup.h @@ -10,7 +10,7 @@ #define _LINUX_PM_WAKEUP_H #ifndef _DEVICE_H_ -# error "please don't include this file directly" +# error "Please do not include this file directly." #endif #include diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h index c0ef596f340b..5b87c6f4a243 100644 --- a/include/linux/rwlock.h +++ b/include/linux/rwlock.h @@ -2,7 +2,7 @@ #define __LINUX_RWLOCK_H #ifndef __LINUX_INSIDE_SPINLOCK_H -# error "please don't include this file directly" +# error "Please do not include this file directly." #endif /* diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h index dceb0a59b692..31d3d1116323 100644 --- a/include/linux/rwlock_api_smp.h +++ b/include/linux/rwlock_api_smp.h @@ -2,7 +2,7 @@ #define __LINUX_RWLOCK_API_SMP_H #ifndef __LINUX_SPINLOCK_API_SMP_H -# error "please don't include this file directly" +# error "Please do not include this file directly." #endif /* diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index 89eb6f4c659c..9ecb0ab504e3 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -2,7 +2,7 @@ #define __LINUX_SPINLOCK_API_SMP_H #ifndef __LINUX_INSIDE_SPINLOCK_H -# error "please don't include this file directly" +# error "Please do not include this file directly." #endif /* diff --git a/include/linux/spinlock_types_up.h b/include/linux/spinlock_types_up.h index 7f86a2016ac5..fc4e2d017c20 100644 --- a/include/linux/spinlock_types_up.h +++ b/include/linux/spinlock_types_up.h @@ -2,7 +2,7 @@ #define __LINUX_SPINLOCK_TYPES_UP_H #ifndef __LINUX_SPINLOCK_TYPES_RAW_H -# error "please don't include this file directly" +# error "Please do not include this file directly." #endif /* diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h index c87204247592..1e84e71ca495 100644 --- a/include/linux/spinlock_up.h +++ b/include/linux/spinlock_up.h @@ -2,7 +2,7 @@ #define __LINUX_SPINLOCK_UP_H #ifndef __LINUX_INSIDE_SPINLOCK_H -# error "please don't include this file directly" +# error "Please do not include this file directly." #endif #include /* for cpu_relax() */ -- cgit v1.2.3 From bc73b4186736341ab5cd2c199da82db6e1134e13 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Tue, 5 Nov 2024 16:54:05 +0200 Subject: util_macros.h: fix/rework find_closest() macros A bug was found in the find_closest() (find_closest_descending() is also affected after some testing), where for certain values with small progressions, the rounding (done by averaging 2 values) causes an incorrect index to be returned. The rounding issues occur for progressions of 1, 2 and 3. It goes away when the progression/interval between two values is 4 or larger. It's particularly bad for progressions of 1. For example if there's an array of 'a = { 1, 2, 3 }', using 'find_closest(2, a ...)' would return 0 (the index of '1'), rather than returning 1 (the index of '2'). This means that for exact values (with a progression of 1), find_closest() will misbehave and return the index of the value smaller than the one we're searching for. For progressions of 2 and 3, the exact values are obtained correctly; but values aren't approximated correctly (as one would expect). Starting with progressions of 4, all seems to be good (one gets what one would expect). While one could argue that 'find_closest()' should not be used for arrays with progressions of 1 (i.e. '{1, 2, 3, ...}', the macro should still behave correctly. The bug was found while testing the 'drivers/iio/adc/ad7606.c', specifically the oversampling feature. For reference, the oversampling values are listed as: static const unsigned int ad7606_oversampling_avail[7] = { 1, 2, 4, 8, 16, 32, 64, }; When doing: 1. $ echo 1 > /sys/bus/iio/devices/iio\:device0/oversampling_ratio $ cat /sys/bus/iio/devices/iio\:device0/oversampling_ratio 1 # this is fine 2. $ echo 2 > /sys/bus/iio/devices/iio\:device0/oversampling_ratio $ cat /sys/bus/iio/devices/iio\:device0/oversampling_ratio 1 # this is wrong; 2 should be returned here 3. $ echo 3 > /sys/bus/iio/devices/iio\:device0/oversampling_ratio $ cat /sys/bus/iio/devices/iio\:device0/oversampling_ratio 2 # this is fine 4. $ echo 4 > /sys/bus/iio/devices/iio\:device0/oversampling_ratio $ cat /sys/bus/iio/devices/iio\:device0/oversampling_ratio 4 # this is fine And from here-on, the values are as correct (one gets what one would expect.) While writing a kunit test for this bug, a peculiar issue was found for the array in the 'drivers/hwmon/ina2xx.c' & 'drivers/iio/adc/ina2xx-adc.c' drivers. While running the kunit test (for 'ina226_avg_tab' from these drivers): * idx = find_closest([-1 to 2], ina226_avg_tab, ARRAY_SIZE(ina226_avg_tab)); This returns idx == 0, so value. * idx = find_closest(3, ina226_avg_tab, ARRAY_SIZE(ina226_avg_tab)); This returns idx == 0, value 1; and now one could argue whether 3 is closer to 4 or to 1. This quirk only appears for value '3' in this array, but it seems to be a another rounding issue. * And from 4 onwards the 'find_closest'() works fine (one gets what one would expect). This change reworks the find_closest() macros to also check the difference between the left and right elements when 'x'. If the distance to the right is smaller (than the distance to the left), the index is incremented by 1. This also makes redundant the need for using the DIV_ROUND_CLOSEST() macro. In order to accommodate for any mix of negative + positive values, the internal variables '__fc_x', '__fc_mid_x', '__fc_left' & '__fc_right' are forced to 'long' type. This also addresses any potential bugs/issues with 'x' being of an unsigned type. In those situations any comparison between signed & unsigned would be promoted to a comparison between 2 unsigned numbers; this is especially annoying when '__fc_left' & '__fc_right' underflow. The find_closest_descending() macro was also reworked and duplicated from the find_closest(), and it is being iterated in reverse. The main reason for this is to get the same indices as 'find_closest()' (but in reverse). The comparison for '__fc_right < __fc_left' favors going the array in ascending order. For example for array '{ 1024, 512, 256, 128, 64, 16, 4, 1 }' and x = 3, we get: __fc_mid_x = 2 __fc_left = -1 __fc_right = -2 Then '__fc_right < __fc_left' evaluates to true and '__fc_i++' becomes 7 which is not quite incorrect, but 3 is closer to 4 than to 1. This change has been validated with the kunit from the next patch. Link: https://lkml.kernel.org/r/20241105145406.554365-1-aardelean@baylibre.com Fixes: 95d119528b0b ("util_macros.h: add find_closest() macro") Signed-off-by: Alexandru Ardelean Cc: Bartosz Golaszewski Cc: Greg Kroah-Hartman Cc: Signed-off-by: Andrew Morton --- include/linux/util_macros.h | 56 ++++++++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/util_macros.h b/include/linux/util_macros.h index 6bb460c3e818..825487fb66fa 100644 --- a/include/linux/util_macros.h +++ b/include/linux/util_macros.h @@ -4,19 +4,6 @@ #include -#define __find_closest(x, a, as, op) \ -({ \ - typeof(as) __fc_i, __fc_as = (as) - 1; \ - typeof(x) __fc_x = (x); \ - typeof(*a) const *__fc_a = (a); \ - for (__fc_i = 0; __fc_i < __fc_as; __fc_i++) { \ - if (__fc_x op DIV_ROUND_CLOSEST(__fc_a[__fc_i] + \ - __fc_a[__fc_i + 1], 2)) \ - break; \ - } \ - (__fc_i); \ -}) - /** * find_closest - locate the closest element in a sorted array * @x: The reference value. @@ -25,8 +12,27 @@ * @as: Size of 'a'. * * Returns the index of the element closest to 'x'. + * Note: If using an array of negative numbers (or mixed positive numbers), + * then be sure that 'x' is of a signed-type to get good results. */ -#define find_closest(x, a, as) __find_closest(x, a, as, <=) +#define find_closest(x, a, as) \ +({ \ + typeof(as) __fc_i, __fc_as = (as) - 1; \ + long __fc_mid_x, __fc_x = (x); \ + long __fc_left, __fc_right; \ + typeof(*a) const *__fc_a = (a); \ + for (__fc_i = 0; __fc_i < __fc_as; __fc_i++) { \ + __fc_mid_x = (__fc_a[__fc_i] + __fc_a[__fc_i + 1]) / 2; \ + if (__fc_x <= __fc_mid_x) { \ + __fc_left = __fc_x - __fc_a[__fc_i]; \ + __fc_right = __fc_a[__fc_i + 1] - __fc_x; \ + if (__fc_right < __fc_left) \ + __fc_i++; \ + break; \ + } \ + } \ + (__fc_i); \ +}) /** * find_closest_descending - locate the closest element in a sorted array @@ -36,9 +42,27 @@ * @as: Size of 'a'. * * Similar to find_closest() but 'a' is expected to be sorted in descending - * order. + * order. The iteration is done in reverse order, so that the comparison + * of '__fc_right' & '__fc_left' also works for unsigned numbers. */ -#define find_closest_descending(x, a, as) __find_closest(x, a, as, >=) +#define find_closest_descending(x, a, as) \ +({ \ + typeof(as) __fc_i, __fc_as = (as) - 1; \ + long __fc_mid_x, __fc_x = (x); \ + long __fc_left, __fc_right; \ + typeof(*a) const *__fc_a = (a); \ + for (__fc_i = __fc_as; __fc_i >= 1; __fc_i--) { \ + __fc_mid_x = (__fc_a[__fc_i] + __fc_a[__fc_i - 1]) / 2; \ + if (__fc_x <= __fc_mid_x) { \ + __fc_left = __fc_x - __fc_a[__fc_i]; \ + __fc_right = __fc_a[__fc_i - 1] - __fc_x; \ + if (__fc_right < __fc_left) \ + __fc_i--; \ + break; \ + } \ + } \ + (__fc_i); \ +}) /** * is_insidevar - check if the @ptr points inside the @var memory range. -- cgit v1.2.3 From 3f28bbe56c7b77e73f1dd0515cad009cfdd64962 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 5 Nov 2024 01:52:52 +0800 Subject: mm/list_lru: don't pass unnecessary key parameters Patch series "mm/list_lru: Split list_lru lock into per-cgroup scope". When LOCKDEP is not enabled, lock_class_key is an empty struct that is never used. But the list_lru initialization function still takes a placeholder pointer as parameter, and the compiler cannot optimize it because the function is not static and exported. Remove this parameter and move it inside the list_lru struct. Only use it when LOCKDEP is enabled. Kernel builds with LOCKDEP will be slightly larger, while !LOCKDEP builds without it will be slightly smaller (the common case). Link: https://lkml.kernel.org/r/20241104175257.60853-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20241104175257.60853-2-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Shakeel Butt Cc: Chengming Zhou Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: Waiman Long Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 5099a8ccd5f4..eba93f6511f3 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -56,16 +56,28 @@ struct list_lru { bool memcg_aware; struct xarray xa; #endif +#ifdef CONFIG_LOCKDEP + struct lock_class_key *key; +#endif }; void list_lru_destroy(struct list_lru *lru); int __list_lru_init(struct list_lru *lru, bool memcg_aware, - struct lock_class_key *key, struct shrinker *shrinker); + struct shrinker *shrinker); #define list_lru_init(lru) \ - __list_lru_init((lru), false, NULL, NULL) + __list_lru_init((lru), false, NULL) #define list_lru_init_memcg(lru, shrinker) \ - __list_lru_init((lru), true, NULL, shrinker) + __list_lru_init((lru), true, shrinker) + +static inline int list_lru_init_memcg_key(struct list_lru *lru, struct shrinker *shrinker, + struct lock_class_key *key) +{ +#ifdef CONFIG_LOCKDEP + lru->key = key; +#endif + return list_lru_init_memcg(lru, shrinker); +} int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, gfp_t gfp); -- cgit v1.2.3 From fb56fdf8b9a2f7397f8a83dce50189f3f0cf71af Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 5 Nov 2024 01:52:56 +0800 Subject: mm/list_lru: split the lock to per-cgroup scope Currently, every list_lru has a per-node lock that protects adding, deletion, isolation, and reparenting of all list_lru_one instances belonging to this list_lru on this node. This lock contention is heavy when multiple cgroups modify the same list_lru. This lock can be split into per-cgroup scope to reduce contention. To achieve this, we need a stable list_lru_one for every cgroup. This commit adds a lock to each list_lru_one and introduced a helper function lock_list_lru_of_memcg, making it possible to pin the list_lru of a memcg. Then reworked the reparenting process. Reparenting will switch the list_lru_one instances one by one. By locking each instance and marking it dead using the nr_items counter, reparenting ensures that all items in the corresponding cgroup (on-list or not, because items have a stable cgroup, see below) will see the list_lru_one switch synchronously. Objcg reparent is also moved after list_lru reparent so items will have a stable mem cgroup until all list_lru_one instances are drained. The only caller that doesn't work the *_obj interfaces are direct calls to list_lru_{add,del}. But it's only used by zswap and that's also based on objcg, so it's fine. This also changes the bahaviour of the isolation function when LRU_RETRY or LRU_REMOVED_RETRY is returned, because now releasing the lock could unblock reparenting and free the list_lru_one, isolation function will have to return withoug re-lock the lru. prepare() { mkdir /tmp/test-fs modprobe brd rd_nr=1 rd_size=33554432 mkfs.xfs -f /dev/ram0 mount -t xfs /dev/ram0 /tmp/test-fs for i in $(seq 1 512); do mkdir "/tmp/test-fs/$i" for j in $(seq 1 10240); do echo TEST-CONTENT > "/tmp/test-fs/$i/$j" done & done; wait } do_test() { read_worker() { sleep 1 tar -cv "$1" &>/dev/null } read_in_all() { cd "/tmp/test-fs" && ls for i in $(seq 1 512); do (exec sh -c 'echo "$PPID"') > "/sys/fs/cgroup/benchmark/$i/cgroup.procs" read_worker "$i" & done; wait } for i in $(seq 1 512); do mkdir -p "/sys/fs/cgroup/benchmark/$i" done echo +memory > /sys/fs/cgroup/benchmark/cgroup.subtree_control echo 512M > /sys/fs/cgroup/benchmark/memory.max echo 3 > /proc/sys/vm/drop_caches time read_in_all } Above script simulates compression of small files in multiple cgroups with memory pressure. Run prepare() then do_test for 6 times: Before: real 0m7.762s user 0m11.340s sys 3m11.224s real 0m8.123s user 0m11.548s sys 3m2.549s real 0m7.736s user 0m11.515s sys 3m11.171s real 0m8.539s user 0m11.508s sys 3m7.618s real 0m7.928s user 0m11.349s sys 3m13.063s real 0m8.105s user 0m11.128s sys 3m14.313s After this commit (about ~15% faster): real 0m6.953s user 0m11.327s sys 2m42.912s real 0m7.453s user 0m11.343s sys 2m51.942s real 0m6.916s user 0m11.269s sys 2m43.957s real 0m6.894s user 0m11.528s sys 2m45.346s real 0m6.911s user 0m11.095s sys 2m43.168s real 0m6.773s user 0m11.518s sys 2m40.774s Link: https://lkml.kernel.org/r/20241104175257.60853-6-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Chengming Zhou Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: Shakeel Butt Cc: Waiman Long Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index eba93f6511f3..10ba9a54d42c 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -32,6 +32,8 @@ struct list_lru_one { struct list_head list; /* may become negative during memcg reparenting */ long nr_items; + /* protects all fields above */ + spinlock_t lock; }; struct list_lru_memcg { @@ -41,11 +43,9 @@ struct list_lru_memcg { }; struct list_lru_node { - /* protects all lists on the node, including per cgroup */ - spinlock_t lock; /* global list, used for the root cgroup in cgroup aware lrus */ struct list_lru_one lru; - long nr_items; + atomic_long_t nr_items; } ____cacheline_aligned_in_smp; struct list_lru { -- cgit v1.2.3 From da0c02516c501b43bd39ad4aca5779c86153d143 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 5 Nov 2024 01:52:57 +0800 Subject: mm/list_lru: simplify the list_lru walk callback function Now isolation no longer takes the list_lru global node lock, only use the per-cgroup lock instead. And this lock is inside the list_lru_one being walked, no longer needed to pass the lock explicitly. Link: https://lkml.kernel.org/r/20241104175257.60853-7-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Chengming Zhou Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: Shakeel Butt Cc: Waiman Long Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 10ba9a54d42c..05c166811f6b 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -184,7 +184,7 @@ void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, struct list_head *head); typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, - struct list_lru_one *list, spinlock_t *lock, void *cb_arg); + struct list_lru_one *list, void *cb_arg); /** * list_lru_walk_one: walk a @lru, isolating and disposing freeable items. -- cgit v1.2.3 From 7591c127f3b17d5879f18819cad7058bf3a2e276 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 4 Nov 2024 11:19:44 +0000 Subject: kmemleak: iommu/iova: fix transient kmemleak false positive The introduction of iova_depot_pop() in 911aa1245da8 ("iommu/iova: Make the rcache depot scale better") confused kmemleak by moving a struct iova_magazine object from a singly linked list to rcache->depot and resetting the 'next' pointer referencing it. Unlike doubly linked lists, the content of the object being referred is never changed on removal from a singly linked list and the kmemleak checksum heuristics do not detect such scenario. This leads to false positives like: unreferenced object 0xffff8881a5301000 (size 1024): comm "softirq", pid 0, jiffies 4306297099 (age 462.991s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 e7 7d 05 00 00 00 00 00 .........}...... 0f b4 05 00 00 00 00 00 b4 96 05 00 00 00 00 00 ................ backtrace: [] __kmem_cache_alloc_node+0x1e8/0x320 [] kmalloc_trace+0x2a/0x60 [] free_iova_fast+0x28e/0x4e0 [] fq_ring_free_locked+0x1b0/0x310 [] fq_flush_timeout+0x19d/0x2e0 [] call_timer_fn+0x19a/0x5c0 [] __run_timers+0x78b/0xb80 [] run_timer_softirq+0x5d/0xd0 [] __do_softirq+0x205/0x8b5 Introduce kmemleak_transient_leak() which resets the object checksum requiring another scan pass before it is reported (if still unreferenced). Call this new API in iova_depot_pop(). Link: https://lkml.kernel.org/r/20241104111944.2207155-1-catalin.marinas@arm.com Link: https://lore.kernel.org/r/ZY1osaGLyT-sdKE8@shredder/ Signed-off-by: Catalin Marinas Reported-by: Ido Schimmel Tested-by: Ido Schimmel Acked-by: Robin Murphy Cc: Joerg Roedel Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/kmemleak.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 6a3cd1bf4680..93a73c076d16 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -26,6 +26,7 @@ extern void kmemleak_free_part(const void *ptr, size_t size) __ref; extern void kmemleak_free_percpu(const void __percpu *ptr) __ref; extern void kmemleak_update_trace(const void *ptr) __ref; extern void kmemleak_not_leak(const void *ptr) __ref; +extern void kmemleak_transient_leak(const void *ptr) __ref; extern void kmemleak_ignore(const void *ptr) __ref; extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref; extern void kmemleak_no_scan(const void *ptr) __ref; @@ -93,6 +94,9 @@ static inline void kmemleak_update_trace(const void *ptr) static inline void kmemleak_not_leak(const void *ptr) { } +static inline void kmemleak_transient_leak(const void *ptr) +{ +} static inline void kmemleak_ignore(const void *ptr) { } -- cgit v1.2.3 From 7269ed4af344184ab9bdf318fe8864cf64849735 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Mon, 4 Nov 2024 15:07:12 +0800 Subject: mm: define general function pXd_init() pud_init(), pmd_init() and kernel_pte_init() are duplicated defined in file kasan.c and sparse-vmemmap.c as weak functions. Move them to generic header file pgtable.h, architecture can redefine them. Link: https://lkml.kernel.org/r/20241104070712.52902-1-maobibo@loongson.cn Signed-off-by: Bibo Mao Reviewed-by: Huacai Chen Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Thomas Bogendoerfer Cc: Vincenzo Frascino Cc: WANG Xuerui Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 --- include/linux/pgtable.h | 21 +++++++++++++++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 32888a97ab44..5d6cd523c7c0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3819,9 +3819,6 @@ void *sparse_buffer_alloc(unsigned long size); struct page * __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap); -void pud_init(void *addr); -void pmd_init(void *addr); -void kernel_pte_init(void *addr); pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 23aeffd89a4e..adef9d6e9b1b 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -90,6 +90,27 @@ static inline unsigned long pud_index(unsigned long address) #define pgd_index(a) (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) #endif +#ifndef kernel_pte_init +static inline void kernel_pte_init(void *addr) +{ +} +#define kernel_pte_init kernel_pte_init +#endif + +#ifndef pmd_init +static inline void pmd_init(void *addr) +{ +} +#define pmd_init pmd_init +#endif + +#ifndef pud_init +static inline void pud_init(void *addr) +{ +} +#define pud_init pud_init +#endif + #ifndef pte_offset_kernel static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) { -- cgit v1.2.3 From 5dc51ec86df6e2214d8398079c1e31736593ab53 Mon Sep 17 00:00:00 2001 From: Martin Karsten Date: Sat, 9 Nov 2024 05:02:31 +0000 Subject: net: Add napi_struct parameter irq_suspend_timeout Add a per-NAPI IRQ suspension parameter, which can be get/set with netdev-genl. This patch doesn't change any behavior but prepares the code for other changes in the following commits which use irq_suspend_timeout as a timeout for IRQ suspension. Signed-off-by: Martin Karsten Co-developed-by: Joe Damato Signed-off-by: Joe Damato Tested-by: Joe Damato Tested-by: Martin Karsten Acked-by: Stanislav Fomichev Reviewed-by: Sridhar Samudrala Link: https://patch.msgid.link/20241109050245.191288-2-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index df4483598628..0aae346d919e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -348,6 +348,7 @@ struct gro_list { */ struct napi_config { u64 gro_flush_timeout; + u64 irq_suspend_timeout; u32 defer_hard_irqs; unsigned int napi_id; }; @@ -384,6 +385,7 @@ struct napi_struct { struct hrtimer timer; struct task_struct *thread; unsigned long gro_flush_timeout; + unsigned long irq_suspend_timeout; u32 defer_hard_irqs; /* control-path-only fields follow */ struct list_head dev_list; -- cgit v1.2.3 From 213a695297e1f0c2ed814488757d496b0d7f7267 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Mon, 11 Nov 2024 20:49:11 +0800 Subject: bpf: Replace the document for PTR_TO_BTF_ID_OR_NULL Commit c25b2ae13603 ("bpf: Replace PTR_TO_XXX_OR_NULL with PTR_TO_XXX | PTR_MAYBE_NULL") moved the fields around and misplaced the documentation for "PTR_TO_BTF_ID_OR_NULL". So, let's replace it in the proper place. Signed-off-by: Menglong Dong Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241111124911.1436911-1-dongml2@chinatelecom.cn --- include/linux/bpf.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1b84613b10ac..7da41ae2eac8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -932,10 +932,6 @@ enum bpf_reg_type { * additional context, assume the value is non-null. */ PTR_TO_BTF_ID, - /* PTR_TO_BTF_ID_OR_NULL points to a kernel struct that has not - * been checked for null. Used primarily to inform the verifier - * an explicit null check is required for this struct. - */ PTR_TO_MEM, /* reg points to valid memory region */ PTR_TO_ARENA, PTR_TO_BUF, /* reg points to a read/write buffer */ @@ -948,6 +944,10 @@ enum bpf_reg_type { PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCKET, PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON, PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | PTR_TO_TCP_SOCK, + /* PTR_TO_BTF_ID_OR_NULL points to a kernel struct that has not + * been checked for null. Used primarily to inform the verifier + * an explicit null check is required for this struct. + */ PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is -- cgit v1.2.3 From 12079a59ce52e72a342c49cfacf0281213fd6f32 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 7 Nov 2024 08:11:44 -0800 Subject: net: Implement fault injection forcing skb reallocation Introduce a fault injection mechanism to force skb reallocation. The primary goal is to catch bugs related to pointer invalidation after potential skb reallocation. The fault injection mechanism aims to identify scenarios where callers retain pointers to various headers in the skb but fail to reload these pointers after calling a function that may reallocate the data. This type of bug can lead to memory corruption or crashes if the old, now-invalid pointers are used. By forcing reallocation through fault injection, we can stress-test code paths and ensure proper pointer management after potential skb reallocations. Add a hook for fault injection in the following functions: * pskb_trim_rcsum() * pskb_may_pull_reason() * pskb_trim() As the other fault injection mechanism, protect it under a debug Kconfig called CONFIG_FAIL_SKB_REALLOC. This patch was *heavily* inspired by Jakub's proposal from: https://lore.kernel.org/all/20240719174140.47a868e6@kernel.org/ CC: Akinobu Mita Suggested-by: Jakub Kicinski Signed-off-by: Breno Leitao Reviewed-by: Akinobu Mita Acked-by: Paolo Abeni Acked-by: Guillaume Nault Link: https://patch.msgid.link/20241107-fault_v6-v6-1-1b82cb6ecacd@debian.org Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 60535c706851..58009fa66102 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2682,6 +2682,12 @@ static inline void skb_assert_len(struct sk_buff *skb) #endif /* CONFIG_DEBUG_NET */ } +#if defined(CONFIG_FAIL_SKB_REALLOC) +void skb_might_realloc(struct sk_buff *skb); +#else +static inline void skb_might_realloc(struct sk_buff *skb) {} +#endif + /* * Add data to an sk_buff */ @@ -2782,6 +2788,7 @@ static inline enum skb_drop_reason pskb_may_pull_reason(struct sk_buff *skb, unsigned int len) { DEBUG_NET_WARN_ON_ONCE(len > INT_MAX); + skb_might_realloc(skb); if (likely(len <= skb_headlen(skb))) return SKB_NOT_DROPPED_YET; @@ -3240,6 +3247,7 @@ static inline int __pskb_trim(struct sk_buff *skb, unsigned int len) static inline int pskb_trim(struct sk_buff *skb, unsigned int len) { + skb_might_realloc(skb); return (len < skb->len) ? __pskb_trim(skb, len) : 0; } @@ -3994,6 +4002,7 @@ int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len); static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) { + skb_might_realloc(skb); if (likely(len >= skb->len)) return 0; return pskb_trim_rcsum_slow(skb, len); -- cgit v1.2.3 From 406c5548c661df0bff6bb6ee79bf9d49faf23e31 Mon Sep 17 00:00:00 2001 From: MeiChia Chiu Date: Tue, 12 Nov 2024 16:38:46 +0800 Subject: wifi: mac80211: Support EHT 1024 aggregation size in TX Support EHT 1024 aggregation size in TX The 1024 agg size for RX is supported but not for TX. This patch adds this support and refactors common parsing logics for addbaext in both process_addba_resp and process_addba_req into a function. Reviewed-by: Shayne Chen Reviewed-by: Money Wang Co-developed-by: Peter Chiu Signed-off-by: Peter Chiu Signed-off-by: MeiChia Chiu Link: https://patch.msgid.link/20241112083846.32063-1-MeiChia.Chiu@mediatek.com [pass elems/len instead of mgmt/len/is_req] Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 456bca45ff05..05dedc45505c 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1445,6 +1445,8 @@ struct ieee80211_mgmt { __le16 status; __le16 capab; __le16 timeout; + /* followed by BA Extension */ + u8 variable[]; } __packed addba_resp; struct{ u8 action_code; -- cgit v1.2.3 From 8182a8b39aa227f5b99b8d4d18f296b82ce4b94c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Nov 2024 06:43:55 +0100 Subject: writeback: wbc_attach_fdatawrite_inode out of line This allows exporting this high-level interface only while keeping wbc_attach_and_unlock_inode private in fs-writeback.c and unexporting __inode_attach_wb. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241112054403.1470586-3-hch@lst.de Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/writeback.h | 28 ++-------------------------- 1 file changed, 2 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d6db822e4bb3..aee3e1b4c50f 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -213,9 +213,6 @@ static inline void wait_on_inode(struct inode *inode) #include void __inode_attach_wb(struct inode *inode, struct folio *folio); -void wbc_attach_and_unlock_inode(struct writeback_control *wbc, - struct inode *inode) - __releases(&inode->i_lock); void wbc_detach_inode(struct writeback_control *wbc); void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, size_t bytes); @@ -254,22 +251,8 @@ static inline void inode_detach_wb(struct inode *inode) } } -/** - * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite - * @wbc: writeback_control of interest - * @inode: target inode - * - * This function is to be used by __filemap_fdatawrite_range(), which is an - * alternative entry point into writeback code, and first ensures @inode is - * associated with a bdi_writeback and attaches it to @wbc. - */ -static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, - struct inode *inode) -{ - spin_lock(&inode->i_lock); - inode_attach_wb(inode, NULL); - wbc_attach_and_unlock_inode(wbc, inode); -} +void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, + struct inode *inode); /** * wbc_init_bio - writeback specific initializtion of bio @@ -303,13 +286,6 @@ static inline void inode_detach_wb(struct inode *inode) { } -static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc, - struct inode *inode) - __releases(&inode->i_lock) -{ - spin_unlock(&inode->i_lock); -} - static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, struct inode *inode) { -- cgit v1.2.3 From 365f34483be33a9d0151c06ac39627d7927210d9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Oct 2024 09:11:00 -0700 Subject: srcu: Renaming in preparation for additional reader flavor Currently, there are only two flavors of readers, normal and NMI-safe. A number of fields, functions, and types reflect this restriction. This renaming-only commit prepares for the addition of light-weight (as in memory-barrier-free) readers. OK, OK, there is also a drive-by white-space fixeup! Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- include/linux/srcu.h | 21 ++++++++++----------- include/linux/srcutree.h | 2 +- 2 files changed, 11 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 835bbb2d1f88..06728ef6f32a 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -181,10 +181,9 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) #define SRCU_NMI_SAFE 0x2 #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_TREE_SRCU) -void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe); +void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); #else -static inline void srcu_check_nmi_safety(struct srcu_struct *ssp, - bool nmi_safe) { } +static inline void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) { } #endif @@ -245,7 +244,7 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) { int retval; - srcu_check_nmi_safety(ssp, false); + srcu_check_read_flavor(ssp, false); retval = __srcu_read_lock(ssp); srcu_lock_acquire(&ssp->dep_map); return retval; @@ -262,7 +261,7 @@ static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp { int retval; - srcu_check_nmi_safety(ssp, true); + srcu_check_read_flavor(ssp, true); retval = __srcu_read_lock_nmisafe(ssp); rcu_try_lock_acquire(&ssp->dep_map); return retval; @@ -274,7 +273,7 @@ srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp) { int retval; - srcu_check_nmi_safety(ssp, false); + srcu_check_read_flavor(ssp, false); retval = __srcu_read_lock(ssp); return retval; } @@ -303,7 +302,7 @@ srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp) static inline int srcu_down_read(struct srcu_struct *ssp) __acquires(ssp) { WARN_ON_ONCE(in_nmi()); - srcu_check_nmi_safety(ssp, false); + srcu_check_read_flavor(ssp, false); return __srcu_read_lock(ssp); } @@ -318,7 +317,7 @@ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp) { WARN_ON_ONCE(idx & ~0x1); - srcu_check_nmi_safety(ssp, false); + srcu_check_read_flavor(ssp, false); srcu_lock_release(&ssp->dep_map); __srcu_read_unlock(ssp, idx); } @@ -334,7 +333,7 @@ static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases(ssp) { WARN_ON_ONCE(idx & ~0x1); - srcu_check_nmi_safety(ssp, true); + srcu_check_read_flavor(ssp, true); rcu_lock_release(&ssp->dep_map); __srcu_read_unlock_nmisafe(ssp, idx); } @@ -343,7 +342,7 @@ static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) static inline notrace void srcu_read_unlock_notrace(struct srcu_struct *ssp, int idx) __releases(ssp) { - srcu_check_nmi_safety(ssp, false); + srcu_check_read_flavor(ssp, false); __srcu_read_unlock(ssp, idx); } @@ -360,7 +359,7 @@ static inline void srcu_up_read(struct srcu_struct *ssp, int idx) { WARN_ON_ONCE(idx & ~0x1); WARN_ON_ONCE(in_nmi()); - srcu_check_nmi_safety(ssp, false); + srcu_check_read_flavor(ssp, false); __srcu_read_unlock(ssp, idx); } diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index ed57598394de..ab7d8d215b84 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -25,7 +25,7 @@ struct srcu_data { /* Read-side state. */ atomic_long_t srcu_lock_count[2]; /* Locks per CPU. */ atomic_long_t srcu_unlock_count[2]; /* Unlocks per CPU. */ - int srcu_nmi_safety; /* NMI-safe srcu_struct structure? */ + int srcu_reader_flavor; /* Reader flavor for srcu_struct structure? */ /* Update-side state. */ spinlock_t __private lock ____cacheline_internodealigned_in_smp; -- cgit v1.2.3 From c071b8e5351453c2cb2d12f425d928f3a24ed2d3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Oct 2024 09:11:03 -0700 Subject: srcu: Improve srcu_read_lock{,_nmisafe}() comments This commit adds some additional usage constraints to the kernel-doc headers of srcu_read_lock() and srcu_read_lock_nmi_safe(). Suggested-by: Andrii Nakryiko Signed-off-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- include/linux/srcu.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 06728ef6f32a..46fd06b212ba 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -235,10 +235,13 @@ static inline void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flav * a mutex that is held elsewhere while calling synchronize_srcu() or * synchronize_srcu_expedited(). * - * Note that srcu_read_lock() and the matching srcu_read_unlock() must - * occur in the same context, for example, it is illegal to invoke - * srcu_read_unlock() in an irq handler if the matching srcu_read_lock() - * was invoked in process context. + * The return value from srcu_read_lock() must be passed unaltered + * to the matching srcu_read_unlock(). Note that srcu_read_lock() and + * the matching srcu_read_unlock() must occur in the same context, for + * example, it is illegal to invoke srcu_read_unlock() in an irq handler + * if the matching srcu_read_lock() was invoked in process context. Or, + * for that matter to invoke srcu_read_unlock() from one task and the + * matching srcu_read_lock() from another. */ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) { @@ -256,6 +259,10 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) * * Enter an SRCU read-side critical section, but in an NMI-safe manner. * See srcu_read_lock() for more information. + * + * If srcu_read_lock_nmisafe() is ever used on an srcu_struct structure, + * then none of the other flavors may be used, whether before, during, + * or after. */ static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp) { -- cgit v1.2.3 From 05829be27fe6f64e0675dc3be3a12d43b52492e1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Oct 2024 09:11:04 -0700 Subject: srcu: Create CPP macros for normal and NMI-safe SRCU readers This commit creates SRCU_READ_FLAVOR_NORMAL and SRCU_READ_FLAVOR_NMI C-preprocessor macros for srcu_read_lock() and srcu_read_lock_nmisafe(), respectively. These replace the old true/false values that were previously passed to srcu_check_read_flavor(). In addition, the srcu_check_read_flavor() function itself requires a bit of rework to handle bitmasks instead of true/false values. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- include/linux/srcu.h | 20 ++++++++------------ include/linux/srcutree.h | 4 ++++ 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 46fd06b212ba..84daaa33ea0a 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -176,10 +176,6 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -#define SRCU_NMI_UNKNOWN 0x0 -#define SRCU_NMI_UNSAFE 0x1 -#define SRCU_NMI_SAFE 0x2 - #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_TREE_SRCU) void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); #else @@ -247,7 +243,7 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) { int retval; - srcu_check_read_flavor(ssp, false); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); retval = __srcu_read_lock(ssp); srcu_lock_acquire(&ssp->dep_map); return retval; @@ -268,7 +264,7 @@ static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp { int retval; - srcu_check_read_flavor(ssp, true); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NMI); retval = __srcu_read_lock_nmisafe(ssp); rcu_try_lock_acquire(&ssp->dep_map); return retval; @@ -280,7 +276,7 @@ srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp) { int retval; - srcu_check_read_flavor(ssp, false); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); retval = __srcu_read_lock(ssp); return retval; } @@ -309,7 +305,7 @@ srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp) static inline int srcu_down_read(struct srcu_struct *ssp) __acquires(ssp) { WARN_ON_ONCE(in_nmi()); - srcu_check_read_flavor(ssp, false); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); return __srcu_read_lock(ssp); } @@ -324,7 +320,7 @@ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp) { WARN_ON_ONCE(idx & ~0x1); - srcu_check_read_flavor(ssp, false); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); srcu_lock_release(&ssp->dep_map); __srcu_read_unlock(ssp, idx); } @@ -340,7 +336,7 @@ static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases(ssp) { WARN_ON_ONCE(idx & ~0x1); - srcu_check_read_flavor(ssp, true); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NMI); rcu_lock_release(&ssp->dep_map); __srcu_read_unlock_nmisafe(ssp, idx); } @@ -349,7 +345,7 @@ static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) static inline notrace void srcu_read_unlock_notrace(struct srcu_struct *ssp, int idx) __releases(ssp) { - srcu_check_read_flavor(ssp, false); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); __srcu_read_unlock(ssp, idx); } @@ -366,7 +362,7 @@ static inline void srcu_up_read(struct srcu_struct *ssp, int idx) { WARN_ON_ONCE(idx & ~0x1); WARN_ON_ONCE(in_nmi()); - srcu_check_read_flavor(ssp, false); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); __srcu_read_unlock(ssp, idx); } diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index ab7d8d215b84..79ad809c7f03 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -43,6 +43,10 @@ struct srcu_data { struct srcu_struct *ssp; }; +/* Values for ->srcu_reader_flavor. */ +#define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock(). +#define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). + /* * Node in SRCU combining tree, similar in function to rcu_data. */ -- cgit v1.2.3 From 6364dd8191d27230176ac4b1b4daaecaf4807399 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Oct 2024 09:11:05 -0700 Subject: srcu: Add srcu_read_lock_lite() and srcu_read_unlock_lite() This patch adds srcu_read_lock_lite() and srcu_read_unlock_lite(), which dispense with the read-side smp_mb() but also are restricted to code regions that RCU is watching. If a given srcu_struct structure uses srcu_read_lock_lite() and srcu_read_unlock_lite(), it is not permitted to use any other SRCU read-side marker, before, during, or after. Another price of light-weight readers is heavier weight grace periods. Such readers mean that SRCU grace periods on srcu_struct structures used by light-weight readers will incur at least two calls to synchronize_rcu(). In addition, normal SRCU grace periods for light-weight-reader srcu_struct structures never auto-expedite. Note that expedited SRCU grace periods for light-weight-reader srcu_struct structures still invoke synchronize_rcu(), not synchronize_srcu_expedited(). Something about wishing to keep the IPIs down to a dull roar. The srcu_read_lock_lite() and srcu_read_unlock_lite() functions may not (repeat, *not*) be used from NMI handlers, but if this is needed, an additional flavor of SRCU reader can be added by some future commit. [ paulmck: Apply Alexei Starovoitov expediting feedback. ] [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: Paul E. McKenney Tested-by: kernel test robot Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- include/linux/srcu.h | 51 +++++++++++++++++++++++++++++++++++++++++++++++- include/linux/srcutree.h | 1 + 2 files changed, 51 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 84daaa33ea0a..4ba96e2cfa40 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -56,6 +56,13 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *head, void cleanup_srcu_struct(struct srcu_struct *ssp); int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); +#ifdef CONFIG_TINY_SRCU +#define __srcu_read_lock_lite __srcu_read_lock +#define __srcu_read_unlock_lite __srcu_read_unlock +#else // #ifdef CONFIG_TINY_SRCU +int __srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp); +void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) __releases(ssp); +#endif // #else // #ifdef CONFIG_TINY_SRCU void synchronize_srcu(struct srcu_struct *ssp); #define SRCU_GET_STATE_COMPLETED 0x1 @@ -179,7 +186,7 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_TREE_SRCU) void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); #else -static inline void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) { } +#define srcu_check_read_flavor(ssp, read_flavor) do { } while (0) #endif @@ -249,6 +256,32 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) return retval; } +/** + * srcu_read_lock_lite - register a new reader for an SRCU-protected structure. + * @ssp: srcu_struct in which to register the new reader. + * + * Enter an SRCU read-side critical section, but for a light-weight + * smp_mb()-free reader. See srcu_read_lock() for more information. + * + * If srcu_read_lock_lite() is ever used on an srcu_struct structure, + * then none of the other flavors may be used, whether before, during, + * or after. Note that grace-period auto-expediting is disabled for _lite + * srcu_struct structures because auto-expedited grace periods invoke + * synchronize_rcu_expedited(), IPIs and all. + * + * Note that srcu_read_lock_lite() can be invoked only from those contexts + * where RCU is watching. Otherwise, lockdep will complain. + */ +static inline int srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp) +{ + int retval; + + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE); + retval = __srcu_read_lock_lite(ssp); + rcu_try_lock_acquire(&ssp->dep_map); + return retval; +} + /** * srcu_read_lock_nmisafe - register a new reader for an SRCU-protected structure. * @ssp: srcu_struct in which to register the new reader. @@ -325,6 +358,22 @@ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) __srcu_read_unlock(ssp, idx); } +/** + * srcu_read_unlock_lite - unregister a old reader from an SRCU-protected structure. + * @ssp: srcu_struct in which to unregister the old reader. + * @idx: return value from corresponding srcu_read_lock(). + * + * Exit a light-weight SRCU read-side critical section. + */ +static inline void srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) + __releases(ssp) +{ + WARN_ON_ONCE(idx & ~0x1); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE); + srcu_lock_release(&ssp->dep_map); + __srcu_read_unlock(ssp, idx); +} + /** * srcu_read_unlock_nmisafe - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 79ad809c7f03..8074138cbd62 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -46,6 +46,7 @@ struct srcu_data { /* Values for ->srcu_reader_flavor. */ #define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock(). #define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). +#define SRCU_READ_FLAVOR_LITE 0x4 // srcu_read_lock_lite(). /* * Node in SRCU combining tree, similar in function to rcu_data. -- cgit v1.2.3 From bb94b12e4503bdce003a74e95ee4214eba923f86 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Oct 2024 09:11:06 -0700 Subject: srcu: Allow inlining of __srcu_read_{,un}lock_lite() This commit moves __srcu_read_lock_lite() and __srcu_read_unlock_lite() into include/linux/srcu.h and marks them "static inline" so that they can be inlined into srcu_read_lock_lite() and srcu_read_unlock_lite(), respectively. They are not hand-inlined due to Tree SRCU and Tiny SRCU having different implementations. The earlier removal of smp_mb() combined with the inlining produce significant single-percentage performance wins. Link: https://lore.kernel.org/all/CAEf4BzYgiNmSb=ZKQ65tm6nJDi1UX2Gq26cdHSH1mPwXJYZj5g@mail.gmail.com/ Reported-by: Alexei Starovoitov Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- include/linux/srcutree.h | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 8074138cbd62..778eb61542e1 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -209,4 +209,43 @@ void synchronize_srcu_expedited(struct srcu_struct *ssp); void srcu_barrier(struct srcu_struct *ssp); void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf); +/* + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. Returns an index that must be passed to the matching + * srcu_read_unlock_lite(). + * + * Note that this_cpu_inc() is an RCU read-side critical section either + * because it disables interrupts, because it is a single instruction, + * or because it is a read-modify-write atomic operation, depending on + * the whims of the architecture. + */ +static inline int __srcu_read_lock_lite(struct srcu_struct *ssp) +{ + int idx; + + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_lite()."); + idx = READ_ONCE(ssp->srcu_idx) & 0x1; + this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter); /* Y */ + barrier(); /* Avoid leaking the critical section. */ + return idx; +} + +/* + * Removes the count for the old reader from the appropriate + * per-CPU element of the srcu_struct. Note that this may well be a + * different CPU than that which was incremented by the corresponding + * srcu_read_lock_lite(), but it must be within the same task. + * + * Note that this_cpu_inc() is an RCU read-side critical section either + * because it disables interrupts, because it is a single instruction, + * or because it is a read-modify-write atomic operation, depending on + * the whims of the architecture. + */ +static inline void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) +{ + barrier(); /* Avoid leaking the critical section. */ + this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter); /* Z */ + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_lite()."); +} + #endif -- cgit v1.2.3 From 768b1f87098a4a586353898b074989808b1b27ad Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Oct 2024 09:11:12 -0700 Subject: srcu: Improve srcu_read_lock_lite() kernel-doc comment Where RCU is watching is where it is OK to invoke rcu_read_lock(). Reported-by: Andrii Nakryiko Signed-off-by: Paul E. McKenney Acked-by: Andrii Nakryiko Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- include/linux/srcu.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 4ba96e2cfa40..bab1dae3f69e 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -270,7 +270,8 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) * synchronize_rcu_expedited(), IPIs and all. * * Note that srcu_read_lock_lite() can be invoked only from those contexts - * where RCU is watching. Otherwise, lockdep will complain. + * where RCU is watching, that is, from contexts where it would be legal + * to invoke rcu_read_lock(). Otherwise, lockdep will complain. */ static inline int srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp) { -- cgit v1.2.3 From 7d4f46c2372d5a850e28f63001013de50843b6e6 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 5 Nov 2024 12:04:18 -0800 Subject: iommufd: Move _iommufd_object_alloc helper to a sharable file The following patch will add a new vIOMMU allocator that will require this _iommufd_object_alloc to be sharable with IOMMU drivers (and iommufd too). Add a new driver.c file that will be built with CONFIG_IOMMUFD_DRIVER_CORE selected by CONFIG_IOMMUFD, and put the CONFIG_DRIVER under that remaining to be selectable for drivers to build the existing iova_bitmap.c file. Link: https://patch.msgid.link/r/2f4f6e116dc49ffb67ff6c5e8a7a8e789ab9e98e.1730836219.git.nicolinc@nvidia.com Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 22948dd03d67..94522d4029ca 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -135,4 +135,17 @@ static inline int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx) return -EOPNOTSUPP; } #endif /* CONFIG_IOMMUFD */ + +#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER_CORE) +struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, + size_t size, + enum iommufd_object_type type); +#else /* !CONFIG_IOMMUFD_DRIVER_CORE */ +static inline struct iommufd_object * +_iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, + enum iommufd_object_type type) +{ + return ERR_PTR(-EOPNOTSUPP); +} +#endif /* CONFIG_IOMMUFD_DRIVER_CORE */ #endif -- cgit v1.2.3 From 6b22d562fcd6e3d1cc1c265b0596840946d16a09 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 5 Nov 2024 12:04:19 -0800 Subject: iommufd: Introduce IOMMUFD_OBJ_VIOMMU and its related struct Add a new IOMMUFD_OBJ_VIOMMU with an iommufd_viommu structure to represent a slice of physical IOMMU device passed to or shared with a user space VM. This slice, now a vIOMMU object, is a group of virtualization resources of a physical IOMMU's, such as: - Security namespace for guest owned ID, e.g. guest-controlled cache tags - Non-device-affiliated event reporting, e.g. invalidation queue errors - Access to a sharable nesting parent pagetable across physical IOMMUs - Virtualization of various platforms IDs, e.g. RIDs and others - Delivery of paravirtualized invalidation - Direct assigned invalidation queues - Direct assigned interrupts Add a new viommu_alloc op in iommu_ops, for drivers to allocate their own vIOMMU structures. And this allocation also needs a free(), so add struct iommufd_viommu_ops. To simplify a vIOMMU allocation, provide a iommufd_viommu_alloc() helper. It's suggested that a driver should embed a core-level viommu structure in its driver-level viommu struct and call the iommufd_viommu_alloc() helper, meanwhile the driver can also implement a viommu ops: struct my_driver_viommu { struct iommufd_viommu core; /* driver-owned properties/features */ .... }; static const struct iommufd_viommu_ops my_driver_viommu_ops = { .free = my_driver_viommu_free, /* future ops for virtualization features */ .... }; static struct iommufd_viommu my_driver_viommu_alloc(...) { struct my_driver_viommu *my_viommu = iommufd_viommu_alloc(ictx, my_driver_viommu, core, my_driver_viommu_ops); /* Init my_viommu and related HW feature */ .... return &my_viommu->core; } static struct iommu_domain_ops my_driver_domain_ops = { .... .viommu_alloc = my_driver_viommu_alloc, }; Link: https://patch.msgid.link/r/64685e2b79dea0f1dc56f6ede04809b72d578935.1730836219.git.nicolinc@nvidia.com Suggested-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/linux/iommu.h | 14 ++++++++++++++ include/linux/iommufd.h | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index bd722f473635..2574fc8abaf2 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -42,6 +42,8 @@ struct notifier_block; struct iommu_sva; struct iommu_dma_cookie; struct iommu_fault_param; +struct iommufd_ctx; +struct iommufd_viommu; #define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ #define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */ @@ -542,6 +544,14 @@ static inline int __iommu_copy_struct_from_user_array( * @remove_dev_pasid: Remove any translation configurations of a specific * pasid, so that any DMA transactions with this pasid * will be blocked by the hardware. + * @viommu_alloc: Allocate an iommufd_viommu on a physical IOMMU instance behind + * the @dev, as the set of virtualization resources shared/passed + * to user space IOMMU instance. And associate it with a nesting + * @parent_domain. The @viommu_type must be defined in the header + * include/uapi/linux/iommufd.h + * It is required to call iommufd_viommu_alloc() helper for + * a bundled allocation of the core and the driver structures, + * using the given @ictx pointer. * @pgsize_bitmap: bitmap of all possible supported page sizes * @owner: Driver module providing these ops * @identity_domain: An always available, always attachable identity @@ -591,6 +601,10 @@ struct iommu_ops { void (*remove_dev_pasid)(struct device *dev, ioasid_t pasid, struct iommu_domain *domain); + struct iommufd_viommu *(*viommu_alloc)( + struct device *dev, struct iommu_domain *parent_domain, + struct iommufd_ctx *ictx, unsigned int viommu_type); + const struct iommu_domain_ops *default_domain_ops; unsigned long pgsize_bitmap; struct module *owner; diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 94522d4029ca..4fc2ce332f10 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -17,6 +17,7 @@ struct iommu_group; struct iommufd_access; struct iommufd_ctx; struct iommufd_device; +struct iommufd_viommu_ops; struct page; enum iommufd_object_type { @@ -28,6 +29,7 @@ enum iommufd_object_type { IOMMUFD_OBJ_IOAS, IOMMUFD_OBJ_ACCESS, IOMMUFD_OBJ_FAULT, + IOMMUFD_OBJ_VIOMMU, #ifdef CONFIG_IOMMUFD_TEST IOMMUFD_OBJ_SELFTEST, #endif @@ -78,6 +80,26 @@ void iommufd_access_detach(struct iommufd_access *access); void iommufd_ctx_get(struct iommufd_ctx *ictx); +struct iommufd_viommu { + struct iommufd_object obj; + struct iommufd_ctx *ictx; + struct iommu_device *iommu_dev; + struct iommufd_hwpt_paging *hwpt; + + const struct iommufd_viommu_ops *ops; + + unsigned int type; +}; + +/** + * struct iommufd_viommu_ops - vIOMMU specific operations + * @destroy: Clean up all driver-specific parts of an iommufd_viommu. The memory + * of the vIOMMU will be free-ed by iommufd core after calling this op + */ +struct iommufd_viommu_ops { + void (*destroy)(struct iommufd_viommu *viommu); +}; + #if IS_ENABLED(CONFIG_IOMMUFD) struct iommufd_ctx *iommufd_ctx_from_file(struct file *file); struct iommufd_ctx *iommufd_ctx_from_fd(int fd); @@ -148,4 +170,22 @@ _iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, return ERR_PTR(-EOPNOTSUPP); } #endif /* CONFIG_IOMMUFD_DRIVER_CORE */ + +/* + * Helpers for IOMMU driver to allocate driver structures that will be freed by + * the iommufd core. The free op will be called prior to freeing the memory. + */ +#define iommufd_viommu_alloc(ictx, drv_struct, member, viommu_ops) \ + ({ \ + drv_struct *ret; \ + \ + static_assert(__same_type(struct iommufd_viommu, \ + ((drv_struct *)NULL)->member)); \ + static_assert(offsetof(drv_struct, member.obj) == 0); \ + ret = (drv_struct *)_iommufd_object_alloc( \ + ictx, sizeof(drv_struct), IOMMUFD_OBJ_VIOMMU); \ + if (!IS_ERR(ret)) \ + ret->member.ops = viommu_ops; \ + ret; \ + }) #endif -- cgit v1.2.3 From 69d2689e57f5cb235c0609dd2f8fa10c1a832f87 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 5 Nov 2024 12:04:22 -0800 Subject: iommufd: Add alloc_domain_nested op to iommufd_viommu_ops Allow IOMMU driver to use a vIOMMU object that holds a nesting parent hwpt/domain to allocate a nested domain. Link: https://patch.msgid.link/r/2dcdb5e405dc0deb68230564530d989d285d959c.1730836219.git.nicolinc@nvidia.com Suggested-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 4fc2ce332f10..de9b56265c9c 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -14,6 +14,7 @@ struct device; struct file; struct iommu_group; +struct iommu_user_data; struct iommufd_access; struct iommufd_ctx; struct iommufd_device; @@ -95,9 +96,17 @@ struct iommufd_viommu { * struct iommufd_viommu_ops - vIOMMU specific operations * @destroy: Clean up all driver-specific parts of an iommufd_viommu. The memory * of the vIOMMU will be free-ed by iommufd core after calling this op + * @alloc_domain_nested: Allocate a IOMMU_DOMAIN_NESTED on a vIOMMU that holds a + * nesting parent domain (IOMMU_DOMAIN_PAGING). @user_data + * must be defined in include/uapi/linux/iommufd.h. + * It must fully initialize the new iommu_domain before + * returning. Upon failure, ERR_PTR must be returned. */ struct iommufd_viommu_ops { void (*destroy)(struct iommufd_viommu *viommu); + struct iommu_domain *(*alloc_domain_nested)( + struct iommufd_viommu *viommu, u32 flags, + const struct iommu_user_data *user_data); }; #if IS_ENABLED(CONFIG_IOMMUFD) -- cgit v1.2.3 From 0ce5c2477af2e2284b9c70474e4dae85db211680 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 5 Nov 2024 12:05:09 -0800 Subject: iommufd/viommu: Add IOMMUFD_OBJ_VDEVICE and IOMMU_VDEVICE_ALLOC ioctl Introduce a new IOMMUFD_OBJ_VDEVICE to represent a physical device (struct device) against a vIOMMU (struct iommufd_viommu) object in a VM. This vDEVICE object (and its structure) holds all the infos and attributes in the VM, regarding the device related to the vIOMMU. As an initial patch, add a per-vIOMMU virtual ID. This can be: - Virtual StreamID on a nested ARM SMMUv3, an index to a Stream Table - Virtual DeviceID on a nested AMD IOMMU, an index to a Device Table - Virtual RID on a nested Intel VT-D IOMMU, an index to a Context Table Potentially, this vDEVICE structure would hold some vData for Confidential Compute Architecture (CCA). Use this virtual ID to index an "vdevs" xarray that belongs to a vIOMMU object. Add a new ioctl for vDEVICE allocations. Since a vDEVICE is a connection of a device object and an iommufd_viommu object, take two refcounts in the ioctl handler. Link: https://patch.msgid.link/r/cda8fd2263166e61b8191a3b3207e0d2b08545bf.1730836308.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index de9b56265c9c..71fa1e343023 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -10,6 +10,7 @@ #include #include #include +#include struct device; struct file; @@ -31,6 +32,7 @@ enum iommufd_object_type { IOMMUFD_OBJ_ACCESS, IOMMUFD_OBJ_FAULT, IOMMUFD_OBJ_VIOMMU, + IOMMUFD_OBJ_VDEVICE, #ifdef CONFIG_IOMMUFD_TEST IOMMUFD_OBJ_SELFTEST, #endif @@ -89,6 +91,8 @@ struct iommufd_viommu { const struct iommufd_viommu_ops *ops; + struct xarray vdevs; + unsigned int type; }; -- cgit v1.2.3 From 67db79dc1a411335f8a7e53a5e206d96b237d9a8 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 5 Nov 2024 12:05:11 -0800 Subject: iommu/viommu: Add cache_invalidate to iommufd_viommu_ops This per-vIOMMU cache_invalidate op is like the cache_invalidate_user op in struct iommu_domain_ops, but wider, supporting device cache (e.g. PCI ATC invaldiations). Link: https://patch.msgid.link/r/90138505850fa6b165135e78a87b4cc7022869a4.1730836308.git.nicolinc@nvidia.com Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 71fa1e343023..2bc735ff9511 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -16,6 +16,7 @@ struct device; struct file; struct iommu_group; struct iommu_user_data; +struct iommu_user_data_array; struct iommufd_access; struct iommufd_ctx; struct iommufd_device; @@ -105,12 +106,21 @@ struct iommufd_viommu { * must be defined in include/uapi/linux/iommufd.h. * It must fully initialize the new iommu_domain before * returning. Upon failure, ERR_PTR must be returned. + * @cache_invalidate: Flush hardware cache used by a vIOMMU. It can be used for + * any IOMMU hardware specific cache: TLB and device cache. + * The @array passes in the cache invalidation requests, in + * form of a driver data structure. A driver must update the + * array->entry_num to report the number of handled requests. + * The data structure of the array entry must be defined in + * include/uapi/linux/iommufd.h */ struct iommufd_viommu_ops { void (*destroy)(struct iommufd_viommu *viommu); struct iommu_domain *(*alloc_domain_nested)( struct iommufd_viommu *viommu, u32 flags, const struct iommu_user_data *user_data); + int (*cache_invalidate)(struct iommufd_viommu *viommu, + struct iommu_user_data_array *array); }; #if IS_ENABLED(CONFIG_IOMMUFD) -- cgit v1.2.3 From 4f2e59ccb69866c5165525bd7aee47cd5cd00acc Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 5 Nov 2024 12:05:13 -0800 Subject: iommu: Add iommu_copy_struct_from_full_user_array helper The iommu_copy_struct_from_user_array helper can be used to copy a single entry from a user array which might not be efficient if the array is big. Add a new iommu_copy_struct_from_full_user_array to copy the entire user array at once. Update the existing iommu_copy_struct_from_user_array kdoc accordingly. Link: https://patch.msgid.link/r/5cd773d9c26920c5807d232b21d415ea79172e49.1730836308.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/linux/iommu.h | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 2574fc8abaf2..11de66237eaa 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -493,7 +493,9 @@ static inline int __iommu_copy_struct_from_user_array( * @index: Index to the location in the array to copy user data from * @min_last: The last member of the data structure @kdst points in the * initial version. - * Return 0 for success, otherwise -error. + * + * Copy a single entry from a user array. Return 0 for success, otherwise + * -error. */ #define iommu_copy_struct_from_user_array(kdst, user_array, data_type, index, \ min_last) \ @@ -501,6 +503,50 @@ static inline int __iommu_copy_struct_from_user_array( kdst, user_array, data_type, index, sizeof(*(kdst)), \ offsetofend(typeof(*(kdst)), min_last)) +/** + * iommu_copy_struct_from_full_user_array - Copy iommu driver specific user + * space data from an iommu_user_data_array + * @kdst: Pointer to an iommu driver specific user data that is defined in + * include/uapi/linux/iommufd.h + * @kdst_entry_size: sizeof(*kdst) + * @user_array: Pointer to a struct iommu_user_data_array for a user space + * array + * @data_type: The data type of the @kdst. Must match with @user_array->type + * + * Copy the entire user array. kdst must have room for kdst_entry_size * + * user_array->entry_num bytes. Return 0 for success, otherwise -error. + */ +static inline int +iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size, + struct iommu_user_data_array *user_array, + unsigned int data_type) +{ + unsigned int i; + int ret; + + if (user_array->type != data_type) + return -EINVAL; + if (!user_array->entry_num) + return -EINVAL; + if (likely(user_array->entry_len == kdst_entry_size)) { + if (copy_from_user(kdst, user_array->uptr, + user_array->entry_num * + user_array->entry_len)) + return -EFAULT; + } + + /* Copy item by item */ + for (i = 0; i != user_array->entry_num; i++) { + ret = copy_struct_from_user( + kdst + kdst_entry_size * i, kdst_entry_size, + user_array->uptr + user_array->entry_len * i, + user_array->entry_len); + if (ret) + return ret; + } + return 0; +} + /** * struct iommu_ops - iommu ops and capabilities * @capable: check capability -- cgit v1.2.3 From c747e67978ff473e6830afe0b1f3986dd1f444b5 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 5 Nov 2024 12:05:14 -0800 Subject: iommufd/viommu: Add iommufd_viommu_find_dev helper This avoids a bigger trouble of exposing struct iommufd_device and struct iommufd_vdevice in the public header. Link: https://patch.msgid.link/r/84fa7c624db4d4508067ccfdf42059533950180a.1730836308.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 2bc735ff9511..11110c749200 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -185,6 +185,8 @@ static inline int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx) struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, enum iommufd_object_type type); +struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu, + unsigned long vdev_id); #else /* !CONFIG_IOMMUFD_DRIVER_CORE */ static inline struct iommufd_object * _iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, @@ -192,6 +194,12 @@ _iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, { return ERR_PTR(-EOPNOTSUPP); } + +static inline struct device * +iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id) +{ + return NULL; +} #endif /* CONFIG_IOMMUFD_DRIVER_CORE */ /* -- cgit v1.2.3 From ad8d1e323dd37f91d0c973e2a74c7b9054219adc Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Mon, 2 Sep 2024 07:39:20 +0100 Subject: ARM: 9415/1: amba: Add dev_is_amba() function and export it for modules Add dev_is_amba() function to determine whether the device is a AMBA device. Suggested-by: Andy Shevchenko Signed-off-by: Kunwu Chan Signed-off-by: Russell King (Oracle) --- include/linux/amba/bus.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h index dda2f3ea89cb..9946276aff73 100644 --- a/include/linux/amba/bus.h +++ b/include/linux/amba/bus.h @@ -121,6 +121,7 @@ extern const struct bus_type amba_bustype; #ifdef CONFIG_ARM_AMBA int __amba_driver_register(struct amba_driver *, struct module *); void amba_driver_unregister(struct amba_driver *); +bool dev_is_amba(const struct device *dev); #else static inline int __amba_driver_register(struct amba_driver *drv, struct module *owner) @@ -130,6 +131,10 @@ static inline int __amba_driver_register(struct amba_driver *drv, static inline void amba_driver_unregister(struct amba_driver *drv) { } +static inline bool dev_is_amba(const struct device *dev) +{ + return false; +} #endif struct amba_device *amba_device_alloc(const char *, resource_size_t, size_t); -- cgit v1.2.3 From 67e4fe3985138325c9b21193be52266750616182 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 30 Oct 2024 21:20:54 -0300 Subject: iommu/arm-smmu-v3: Use S2FWB for NESTED domains Force Write Back (FWB) changes how the S2 IOPTE's MemAttr field works. When S2FWB is supported and enabled the IOPTE will force cachable access to IOMMU_CACHE memory when nesting with a S1 and deny cachable access when !IOMMU_CACHE. When using a single stage of translation, a simple S2 domain, it doesn't change things for PCI devices as it is just a different encoding for the existing mapping of the IOMMU protection flags to cachability attributes. For non-PCI it also changes the combining rules when incoming transactions have inconsistent attributes. However, when used with a nested S1, FWB has the effect of preventing the guest from choosing a MemAttr in it's S1 that would cause ordinary DMA to bypass the cache. Consistent with KVM we wish to deny the guest the ability to become incoherent with cached memory the hypervisor believes is cachable so we don't have to flush it. Allow NESTED domains to be created if the SMMU has S2FWB support and use S2FWB for NESTING_PARENTS. This is an additional option to CANWBS. Link: https://patch.msgid.link/r/10-v4-9e99b76f3518+3a8-smmuv3_nesting_jgg@nvidia.com Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Reviewed-by: Jerry Snitselaar Reviewed-by: Donald Dutile Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/linux/io-pgtable.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index b1ecfc3cd5bc..ce86b09ae80f 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -87,6 +87,7 @@ struct io_pgtable_cfg { * attributes set in the TCR for a non-coherent page-table walker. * * IO_PGTABLE_QUIRK_ARM_HD: Enables dirty tracking in stage 1 pagetable. + * IO_PGTABLE_QUIRK_ARM_S2FWB: Use the FWB format for the MemAttrs bits */ #define IO_PGTABLE_QUIRK_ARM_NS BIT(0) #define IO_PGTABLE_QUIRK_NO_PERMS BIT(1) @@ -95,6 +96,7 @@ struct io_pgtable_cfg { #define IO_PGTABLE_QUIRK_ARM_TTBR1 BIT(5) #define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA BIT(6) #define IO_PGTABLE_QUIRK_ARM_HD BIT(7) + #define IO_PGTABLE_QUIRK_ARM_S2FWB BIT(8) unsigned long quirks; unsigned long pgsize_bitmap; unsigned int ias; -- cgit v1.2.3 From 61952bb73486fff0f5550bccdf4062d9dd0fb163 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Nov 2024 18:00:38 +0100 Subject: block: remove the write_hint field from struct request The write_hint is only used for read/write requests, which must have a bio attached to them. Just use the bio field instead. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20241112170050.1612998-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 2035fad3131f..2804fe181d9d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -156,7 +156,6 @@ struct request { struct blk_crypto_keyslot *crypt_keyslot; #endif - enum rw_hint write_hint; unsigned short ioprio; enum mq_rq_state state; -- cgit v1.2.3 From 6975c1a486a40446b5bc77a89d9c520f8296fd08 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Nov 2024 18:00:39 +0100 Subject: block: remove the ioprio field from struct request The request ioprio is only initialized from the first attached bio, so requests without a bio already never set it. Directly use the bio field instead. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20241112170050.1612998-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 2804fe181d9d..a28264442948 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -156,8 +156,6 @@ struct request { struct blk_crypto_keyslot *crypt_keyslot; #endif - unsigned short ioprio; - enum mq_rq_state state; atomic_t ref; @@ -221,7 +219,9 @@ static inline bool blk_rq_is_passthrough(struct request *rq) static inline unsigned short req_get_ioprio(struct request *req) { - return req->ioprio; + if (req->bio) + return req->bio->bi_ioprio; + return 0; } #define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ) @@ -984,7 +984,6 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, rq->nr_phys_segments = nr_segs; rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; - rq->ioprio = bio_prio(bio); } void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, -- cgit v1.2.3 From 174dd22a781b97cb355b1037f0931436a254d3be Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 11 Nov 2024 09:38:26 -0800 Subject: srcu: Remove smp_mb() from srcu_read_unlock_lite() The srcu_read_unlock_lite() function invokes __srcu_read_unlock() instead of __srcu_read_unlock_lite(), which means that it is doing an unnecessary smp_mb(). This is harmless other than the performance degradation. This commit therefore switches to __srcu_read_unlock_lite(). Reported-by: Neeraj Upadhyay Closes: https://lore.kernel.org/all/d07e8f4a-d5ff-4c8e-8e61-50db285c57e9@amd.com/ Fixes: c0f08d6b5a61 ("srcu: Add srcu_read_lock_lite() and srcu_read_unlock_lite()") Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- include/linux/srcu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index bab1dae3f69e..56f83237de4d 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -372,7 +372,7 @@ static inline void srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) WARN_ON_ONCE(idx & ~0x1); srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE); srcu_lock_release(&ssp->dep_map); - __srcu_read_unlock(ssp, idx); + __srcu_read_unlock_lite(ssp, idx); } /** -- cgit v1.2.3 From a76ab5731e32d50ff5b1ae97e9dc4b23f41c23f5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 Nov 2024 08:39:07 -0800 Subject: bpf: Find eligible subprogs for private stack support Private stack will be allocated with percpu allocator in jit time. To avoid complexity at runtime, only one copy of private stack is available per cpu per prog. So runtime recursion check is necessary to avoid stack corruption. Current private stack only supports kprobe/perf_event/tp/raw_tp which has recursion check in the kernel, and prog types that use bpf trampoline recursion check. For trampoline related prog types, currently only tracing progs have recursion checking. To avoid complexity, all async_cb subprogs use normal kernel stack including those subprogs used by both main prog subtree and async_cb subtree. Any prog having tail call also uses kernel stack. To avoid jit penalty with private stack support, a subprog stack size threshold is set such that only if the stack size is no less than the threshold, private stack is supported. The current threshold is 64 bytes. This avoids jit penality if the stack usage is small. A useless 'continue' is also removed from a loop in func check_max_stack_depth(). Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20241112163907.2223839-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 7 +++++++ include/linux/filter.h | 1 + 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3a74033d49c4..d62bb2ca1828 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -633,6 +633,12 @@ struct bpf_subprog_arg_info { }; }; +enum priv_stack_mode { + PRIV_STACK_UNKNOWN, + NO_PRIV_STACK, + PRIV_STACK_ADAPTIVE, +}; + struct bpf_subprog_info { /* 'start' has to be the first field otherwise find_subprog() won't work */ u32 start; /* insn idx of function entry point */ @@ -653,6 +659,7 @@ struct bpf_subprog_info { /* true if bpf_fastcall stack region is used by functions that can't be inlined */ bool keep_fastcall_stack: 1; + enum priv_stack_mode priv_stack_mode; u8 arg_cnt; struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS]; }; diff --git a/include/linux/filter.h b/include/linux/filter.h index 7d7578a8eac1..3a21947f2fd4 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1119,6 +1119,7 @@ bool bpf_jit_supports_exceptions(void); bool bpf_jit_supports_ptr_xchg(void); bool bpf_jit_supports_arena(void); bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena); +bool bpf_jit_supports_private_stack(void); u64 bpf_arch_uaddress_limit(void); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); bool bpf_helper_changes_pkt_data(void *func); -- cgit v1.2.3 From e00931c02568dc6ac76f94b1ab471de05e6fdfe8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 Nov 2024 08:39:12 -0800 Subject: bpf: Enable private stack for eligible subprogs If private stack is used by any subprog, set that subprog prog->aux->jits_use_priv_stack to be true so later jit can allocate private stack for that subprog properly. Also set env->prog->aux->jits_use_priv_stack to be true if any subprog uses private stack. This is a use case for a single main prog (no subprogs) to use private stack, and also a use case for later struct-ops progs where env->prog->aux->jits_use_priv_stack will enable recursion check if any subprog uses private stack. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20241112163912.2224007-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7da41ae2eac8..129b29e85cec 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1523,6 +1523,7 @@ struct bpf_prog_aux { bool exception_cb; bool exception_boundary; bool is_extended; /* true if extended by freplace program */ + bool jits_use_priv_stack; u64 prog_array_member_cnt; /* counts how many times as member of prog_array */ struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */ struct bpf_arena *arena; -- cgit v1.2.3 From 7d1cd70d4b16ff0216a5f6c2ae7d0fa9fa978c07 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 Nov 2024 08:39:22 -0800 Subject: bpf, x86: Support private stack in jit Private stack is allocated in function bpf_int_jit_compile() with alignment 8. Private stack allocation size includes the stack size determined by verifier and additional space to protect stack overflow and underflow. See below an illustration: ---> memory address increasing [8 bytes to protect overflow] [normal stack] [8 bytes to protect underflow] If overflow/underflow is detected, kernel messages will be emited in dmesg like BPF private stack overflow/underflow detected for prog Fx BPF Private stack overflow/underflow detected for prog bpf_prog_a41699c234a1567a_subprog1x Those messages are generated when I made some changes to jitted code to intentially cause overflow for some progs. For the jited prog, The x86 register 9 (X86_REG_R9) is used to replace bpf frame register (BPF_REG_10). The private stack is used per subprog per cpu. The X86_REG_R9 is saved and restored around every func call (not including tailcall) to maintain correctness of X86_REG_R9. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20241112163922.2224385-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 129b29e85cec..d32cc373dfd1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1507,6 +1507,7 @@ struct bpf_prog_aux { u32 max_rdwr_access; struct btf *attach_btf; const struct bpf_ctx_arg_aux *ctx_arg_info; + void __percpu *priv_stack_ptr; struct mutex dst_mutex; /* protects dst_* pointers below, *after* prog becomes visible */ struct bpf_prog *dst_prog; struct bpf_trampoline *dst_trampoline; -- cgit v1.2.3 From 5bd36da1e37e7a78e8b38efd287de6e1394b7d6e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 Nov 2024 08:39:33 -0800 Subject: bpf: Support private stack for struct_ops progs For struct_ops progs, whether a particular prog uses private stack depends on prog->aux->priv_stack_requested setting before actual insn-level verification for that prog. One particular implementation is to piggyback on struct_ops->check_member(). The next patch has an example for this. The struct_ops->check_member() sets prog->aux->priv_stack_requested to be true which enables private stack usage. The struct_ops prog follows the same rule as kprobe/tracing progs after function bpf_enable_priv_stack(). For example, even a struct_ops prog requests private stack, it could still use normal kernel stack if the stack size is small (< 64 bytes). Similar to tracing progs, nested same cpu same prog run will be skipped. A field (recursion_detected()) is added to bpf_prog_aux structure. If bpf_prog->aux->recursion_detected is implemented by the struct_ops subsystem and nested same cpu/prog happens, the function will be triggered to report an error, collect related info, etc. Acked-by: Tejun Heo Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20241112163933.2224962-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ include/linux/bpf_verifier.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d32cc373dfd1..10945c8858ce 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1525,9 +1525,11 @@ struct bpf_prog_aux { bool exception_boundary; bool is_extended; /* true if extended by freplace program */ bool jits_use_priv_stack; + bool priv_stack_requested; u64 prog_array_member_cnt; /* counts how many times as member of prog_array */ struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */ struct bpf_arena *arena; + void (*recursion_detected)(struct bpf_prog *prog); /* callback if recursion is detected */ /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; /* function name for valid attach_btf_id */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d62bb2ca1828..6b7c91629176 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -879,6 +879,7 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) case BPF_PROG_TYPE_TRACING: return prog->expected_attach_type != BPF_TRACE_ITER; case BPF_PROG_TYPE_STRUCT_OPS: + return prog->aux->jits_use_priv_stack; case BPF_PROG_TYPE_LSM: return false; default: -- cgit v1.2.3 From 7c8ce4ffb684676039b1ff9ff81c126794e8d88e Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Tue, 12 Nov 2024 22:58:49 +0800 Subject: bpf: Add kernel symbol for struct_ops trampoline Without kernel symbols for struct_ops trampoline, the unwinder may produce unexpected stacktraces. For example, the x86 ORC and FP unwinders check if an IP is in kernel text by verifying the presence of the IP's kernel symbol. When a struct_ops trampoline address is encountered, the unwinder stops due to the absence of symbol, resulting in an incomplete stacktrace that consists only of direct and indirect child functions called from the trampoline. The arm64 unwinder is another example. While the arm64 unwinder can proceed across a struct_ops trampoline address, the corresponding symbol name is displayed as "unknown", which is confusing. Thus, add kernel symbol for struct_ops trampoline. The name is bpf___, where is the type name of the struct_ops, and is the name of the member that the trampoline is linked to. Below is a comparison of stacktraces captured on x86 by perf record, before and after this patch. Before: ffffffff8116545d __lock_acquire+0xad ([kernel.kallsyms]) ffffffff81167fcc lock_acquire+0xcc ([kernel.kallsyms]) ffffffff813088f4 __bpf_prog_enter+0x34 ([kernel.kallsyms]) After: ffffffff811656bd __lock_acquire+0x30d ([kernel.kallsyms]) ffffffff81167fcc lock_acquire+0xcc ([kernel.kallsyms]) ffffffff81309024 __bpf_prog_enter+0x34 ([kernel.kallsyms]) ffffffffc000d7e9 bpf__tcp_congestion_ops_cong_avoid+0x3e ([kernel.kallsyms]) ffffffff81f250a5 tcp_ack+0x10d5 ([kernel.kallsyms]) ffffffff81f27c66 tcp_rcv_established+0x3b6 ([kernel.kallsyms]) ffffffff81f3ad03 tcp_v4_do_rcv+0x193 ([kernel.kallsyms]) ffffffff81d65a18 __release_sock+0xd8 ([kernel.kallsyms]) ffffffff81d65af4 release_sock+0x34 ([kernel.kallsyms]) ffffffff81f15c4b tcp_sendmsg+0x3b ([kernel.kallsyms]) ffffffff81f663d7 inet_sendmsg+0x47 ([kernel.kallsyms]) ffffffff81d5ab40 sock_write_iter+0x160 ([kernel.kallsyms]) ffffffff8149c67b vfs_write+0x3fb ([kernel.kallsyms]) ffffffff8149caf6 ksys_write+0xc6 ([kernel.kallsyms]) ffffffff8149cb5d __x64_sys_write+0x1d ([kernel.kallsyms]) ffffffff81009200 x64_sys_call+0x1d30 ([kernel.kallsyms]) ffffffff82232d28 do_syscall_64+0x68 ([kernel.kallsyms]) ffffffff8240012f entry_SYSCALL_64_after_hwframe+0x76 ([kernel.kallsyms]) Fixes: 85d33df357b6 ("bpf: Introduce BPF_MAP_TYPE_STRUCT_OPS") Signed-off-by: Xu Kuohai Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20241112145849.3436772-4-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 10945c8858ce..3ace0d6227e3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1402,7 +1402,8 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to); /* Called only from JIT-enabled code, so there's no need for stubs. */ -void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym); +void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym); +void bpf_image_ksym_add(struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); -- cgit v1.2.3 From 12bbabd3cab8a7dab0ddad8ed1e671f40c7cdeeb Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 12 Nov 2024 18:01:25 +0200 Subject: usb: cdns3: Synchronise PCI IDs via common data base There are a few places in the kernel where PCI IDs for different Cadence USB controllers are being used. Besides different naming, they duplicate each other. Make this all in order by providing common definitions via PCI IDs database and use in all users. While doing that, rename definitions as Roger suggested. Suggested-by: Roger Quadros Suggested-by: Greg Kroah-Hartman Signed-off-by: Andy Shevchenko Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20241112160125.2340972-1-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/pci_ids.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 4cf6aaed5f35..eff9eccc52fc 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -121,6 +121,7 @@ #define PCI_CLASS_SERIAL_USB_OHCI 0x0c0310 #define PCI_CLASS_SERIAL_USB_EHCI 0x0c0320 #define PCI_CLASS_SERIAL_USB_XHCI 0x0c0330 +#define PCI_CLASS_SERIAL_USB_CDNS 0x0c0380 #define PCI_CLASS_SERIAL_USB_DEVICE 0x0c03fe #define PCI_CLASS_SERIAL_FIBER 0x0c04 #define PCI_CLASS_SERIAL_SMBUS 0x0c05 @@ -2421,6 +2422,9 @@ #define PCI_VENDOR_ID_QCOM 0x17cb #define PCI_VENDOR_ID_CDNS 0x17cd +#define PCI_DEVICE_ID_CDNS_USBSS 0x0100 +#define PCI_DEVICE_ID_CDNS_USB 0x0120 +#define PCI_DEVICE_ID_CDNS_USBSSP 0x0200 #define PCI_VENDOR_ID_ARECA 0x17d3 #define PCI_DEVICE_ID_ARECA_1110 0x1110 -- cgit v1.2.3 From 948ccbd95016f50ce01df5eef9440eede3b8c713 Mon Sep 17 00:00:00 2001 From: Xianglai Li Date: Wed, 13 Nov 2024 16:18:26 +0800 Subject: LoongArch: KVM: Add iocsr and mmio bus simulation in kernel Add iocsr and mmio memory read and write simulation to the kernel. When the VM accesses the device address space through iocsr instructions or mmio, it does not need to return to the qemu user mode but can directly completes the access in the kernel mode. Signed-off-by: Tianrui Zhao Signed-off-by: Xianglai Li Signed-off-by: Huacai Chen --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 45be36e5285f..164d9725cb08 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -219,6 +219,7 @@ enum kvm_bus { KVM_PIO_BUS, KVM_VIRTIO_CCW_NOTIFY_BUS, KVM_FAST_MMIO_BUS, + KVM_IOCSR_BUS, KVM_NR_BUSES }; -- cgit v1.2.3 From da115c4ee29f589bb72ec2e86eb5e196b6bbcb41 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 12 Nov 2024 15:29:15 +0100 Subject: printk: add dummy printk_force_console_enter/exit helpers The newly added interface is broken when PRINTK is disabled: drivers/tty/sysrq.c: In function '__handle_sysrq': drivers/tty/sysrq.c:601:9: error: implicit declaration of function 'printk_force_console_enter' [-Wimplicit-function-declaration] 601 | printk_force_console_enter(); | ^~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/tty/sysrq.c:611:25: error: implicit declaration of function 'printk_force_console_exit' [-Wimplicit-function-declaration] 611 | printk_force_console_exit(); | ^~~~~~~~~~~~~~~~~~~~~~~~~ Add empty stub functions for both. Fixes: ed76c07c6885 ("printk: Introduce FORCE_CON flag") Signed-off-by: Arnd Bergmann Reviewed-by: Andy Shevchenko Reviewed-by: Marcos Paulo de Souza Tested-by: Marcos Paulo de Souza Link: https://lore.kernel.org/r/20241112142939.724093-1-arnd@kernel.org Signed-off-by: Petr Mladek --- include/linux/printk.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 232e5fd06701..4217a9f412b2 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -232,6 +232,14 @@ static inline void printk_deferred_exit(void) { } +static inline void printk_force_console_enter(void) +{ +} + +static inline void printk_force_console_exit(void) +{ +} + static inline int printk_ratelimit(void) { return 0; -- cgit v1.2.3 From 1d58f7f3a1373734b2e86a246edcf1cd39359f3e Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 25 Oct 2024 21:31:01 +0100 Subject: clocksource/drivers/dw_apb: Remove unused dw_apb_clockevent functions dw_apb_clockevent_pause(), dw_apb_clockevent_resume() and dw_apb_clockevent_stop() have been unused since 2021's commit 1b79fc4f2bfd ("x86/apb_timer: Remove driver for deprecated platform") Remove them. (Some of the other clockevent functions are still called by dw_apb_timer_of.c so I guess it is still in use?) Signed-off-by: Dr. David Alan Gilbert Link: https://lore.kernel.org/r/20241025203101.241709-1-linux@treblig.org Signed-off-by: Daniel Lezcano --- include/linux/dw_apb_timer.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dw_apb_timer.h b/include/linux/dw_apb_timer.h index 82ebf9223948..f8811c46b89e 100644 --- a/include/linux/dw_apb_timer.h +++ b/include/linux/dw_apb_timer.h @@ -34,9 +34,6 @@ struct dw_apb_clocksource { }; void dw_apb_clockevent_register(struct dw_apb_clock_event_device *dw_ced); -void dw_apb_clockevent_pause(struct dw_apb_clock_event_device *dw_ced); -void dw_apb_clockevent_resume(struct dw_apb_clock_event_device *dw_ced); -void dw_apb_clockevent_stop(struct dw_apb_clock_event_device *dw_ced); struct dw_apb_clock_event_device * dw_apb_clockevent_init(int cpu, const char *name, unsigned rating, -- cgit v1.2.3 From 2bd9077b6261b8f1281d0fa74d51afe090319263 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 1 Nov 2024 14:45:23 -0600 Subject: jbd2: avoid dozens of -Wflex-array-member-not-at-end warnings -Wflex-array-member-not-at-end was introduced in GCC-14, and we are getting ready to enable it, globally. Use the `DEFINE_RAW_FLEX()` helper for an on-stack definition of a flexible structure (`struct shash_desc`) where the size of the flexible-array member (`__ctx`) is known at compile-time, and refactor the rest of the code, accordingly. So, with this, fix 77 of the following warnings: include/linux/jbd2.h:1800:35: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] Signed-off-by: Gustavo A. R. Silva Reviewed-by: Jan Kara Link: https://patch.msgid.link/ZyU94w0IALVhc9Jy@kspp Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 8aef9bb6ad57..50f7ea8714bf 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1796,22 +1796,21 @@ static inline unsigned long jbd2_log_space_left(journal_t *journal) static inline u32 jbd2_chksum(journal_t *journal, u32 crc, const void *address, unsigned int length) { - struct { - struct shash_desc shash; - char ctx[JBD_MAX_CHECKSUM_SIZE]; - } desc; + DEFINE_RAW_FLEX(struct shash_desc, desc, __ctx, + DIV_ROUND_UP(JBD_MAX_CHECKSUM_SIZE, + sizeof(*((struct shash_desc *)0)->__ctx))); int err; BUG_ON(crypto_shash_descsize(journal->j_chksum_driver) > JBD_MAX_CHECKSUM_SIZE); - desc.shash.tfm = journal->j_chksum_driver; - *(u32 *)desc.ctx = crc; + desc->tfm = journal->j_chksum_driver; + *(u32 *)desc->__ctx = crc; - err = crypto_shash_update(&desc.shash, address, length); + err = crypto_shash_update(desc, address, length); BUG_ON(err); - return *(u32 *)desc.ctx; + return *(u32 *)desc->__ctx; } /* Return most recent uncommitted transaction */ -- cgit v1.2.3 From 470d2bc3a0bc19a849cc7478c02d3f5ecaa1233e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Nov 2024 09:45:35 +0100 Subject: block: export blk_validate_limits While block drivers do the validation as part of committing them to the queue, users that use the limit outside of a block device context have to validate the limits and fill in the calculated values as well. So far btrfs is the only user of queue limits without a block device, and it has gotten away with that more or less by accident. But with commit 559218d43ec9 ("block: pre-calculate max_zone_append_sectors") this became fatal for setups that have small max zone append size, as it won't be limited now. Export blk_validate_limits so that it can be called directly from btrfs. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241113084541.34315-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 65f37ae70712..cd905afaf51a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -948,6 +948,7 @@ queue_limits_start_update(struct request_queue *q) int queue_limits_commit_update(struct request_queue *q, struct queue_limits *lim); int queue_limits_set(struct request_queue *q, struct queue_limits *lim); +int blk_validate_limits(struct queue_limits *lim); /** * queue_limits_cancel_update - cancel an atomic update of queue limits -- cgit v1.2.3 From f3dd9ae7f03aefa5bb12a4606f3d6cca87863622 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 1 Sep 2024 17:17:09 +0200 Subject: dax: Remove an unused field in struct dax_operations .dax_supported() was apparently removed by commit 7b0800d00dae ("dax: remove dax_capable") on 2021-11. Remove the now unused function pointer from the struct dax_operations. Signed-off-by: Christophe JAILLET Reviewed-by: Jan Kara Link: https://patch.msgid.link/56b92b722ca0a6fd1387c871a6ec01bcb9bd525e.1725203804.git.christophe.jaillet@wanadoo.fr Signed-off-by: Ira Weiny --- include/linux/dax.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 9d3e3327af4c..df41a0017b31 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -27,12 +27,6 @@ struct dax_operations { */ long (*direct_access)(struct dax_device *, pgoff_t, long, enum dax_access_mode, void **, pfn_t *); - /* - * Validate whether this device is usable as an fsdax backing - * device. - */ - bool (*dax_supported)(struct dax_device *, struct block_device *, int, - sector_t, sector_t); /* zero_page_range: required operation. Zero page range */ int (*zero_page_range)(struct dax_device *, pgoff_t, size_t); /* -- cgit v1.2.3 From e8225ab15006fbcdb14cef426a0a54475292fbbc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Nov 2024 16:20:43 +0100 Subject: block: remove rq_list_move Unused now. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241113152050.157179-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index a28264442948..ad26a41d13f9 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -268,23 +268,6 @@ static inline unsigned short req_get_ioprio(struct request *req) #define rq_list_next(rq) (rq)->rq_next #define rq_list_empty(list) ((list) == (struct request *) NULL) -/** - * rq_list_move() - move a struct request from one list to another - * @src: The source list @rq is currently in - * @dst: The destination list that @rq will be appended to - * @rq: The request to move - * @prev: The request preceding @rq in @src (NULL if @rq is the head) - */ -static inline void rq_list_move(struct request **src, struct request **dst, - struct request *rq, struct request *prev) -{ - if (prev) - prev->rq_next = rq->rq_next; - else - *src = rq->rq_next; - rq_list_add(dst, rq); -} - /** * enum blk_eh_timer_return - How the timeout handler should proceed * @BLK_EH_DONE: The block driver completed the command or will complete it at -- cgit v1.2.3 From a3396b99990d8b4e5797e7b16fdeb64c15ae97bb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Nov 2024 16:20:44 +0100 Subject: block: add a rq_list type Replace the semi-open coded request list helpers with a proper rq_list type that mirrors the bio_list and has head and tail pointers. Besides better type safety this actually allows to insert at the tail of the list, which will be useful soon. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241113152050.157179-5-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 88 +++++++++++++++++++++++++++++--------------------- include/linux/blkdev.h | 11 +++++-- 2 files changed, 60 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ad26a41d13f9..c61e04365677 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -229,44 +229,60 @@ static inline unsigned short req_get_ioprio(struct request *req) #define rq_dma_dir(rq) \ (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE) -#define rq_list_add(listptr, rq) do { \ - (rq)->rq_next = *(listptr); \ - *(listptr) = rq; \ -} while (0) - -#define rq_list_add_tail(lastpptr, rq) do { \ - (rq)->rq_next = NULL; \ - **(lastpptr) = rq; \ - *(lastpptr) = &rq->rq_next; \ -} while (0) - -#define rq_list_pop(listptr) \ -({ \ - struct request *__req = NULL; \ - if ((listptr) && *(listptr)) { \ - __req = *(listptr); \ - *(listptr) = __req->rq_next; \ - } \ - __req; \ -}) +static inline int rq_list_empty(const struct rq_list *rl) +{ + return rl->head == NULL; +} -#define rq_list_peek(listptr) \ -({ \ - struct request *__req = NULL; \ - if ((listptr) && *(listptr)) \ - __req = *(listptr); \ - __req; \ -}) +static inline void rq_list_init(struct rq_list *rl) +{ + rl->head = NULL; + rl->tail = NULL; +} + +static inline void rq_list_add_tail(struct rq_list *rl, struct request *rq) +{ + rq->rq_next = NULL; + if (rl->tail) + rl->tail->rq_next = rq; + else + rl->head = rq; + rl->tail = rq; +} + +static inline void rq_list_add_head(struct rq_list *rl, struct request *rq) +{ + rq->rq_next = rl->head; + rl->head = rq; + if (!rl->tail) + rl->tail = rq; +} + +static inline struct request *rq_list_pop(struct rq_list *rl) +{ + struct request *rq = rl->head; + + if (rq) { + rl->head = rl->head->rq_next; + if (!rl->head) + rl->tail = NULL; + rq->rq_next = NULL; + } + + return rq; +} -#define rq_list_for_each(listptr, pos) \ - for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos)) +static inline struct request *rq_list_peek(struct rq_list *rl) +{ + return rl->head; +} -#define rq_list_for_each_safe(listptr, pos, nxt) \ - for (pos = rq_list_peek((listptr)), nxt = rq_list_next(pos); \ - pos; pos = nxt, nxt = pos ? rq_list_next(pos) : NULL) +#define rq_list_for_each(rl, pos) \ + for (pos = rq_list_peek((rl)); (pos); pos = pos->rq_next) -#define rq_list_next(rq) (rq)->rq_next -#define rq_list_empty(list) ((list) == (struct request *) NULL) +#define rq_list_for_each_safe(rl, pos, nxt) \ + for (pos = rq_list_peek((rl)), nxt = pos->rq_next; \ + pos; pos = nxt, nxt = pos ? pos->rq_next : NULL) /** * enum blk_eh_timer_return - How the timeout handler should proceed @@ -559,7 +575,7 @@ struct blk_mq_ops { * empty the @rqlist completely, then the rest will be queued * individually by the block layer upon return. */ - void (*queue_rqs)(struct request **rqlist); + void (*queue_rqs)(struct rq_list *rqlist); /** * @get_budget: Reserve budget before queue request, once .queue_rq is @@ -868,7 +884,7 @@ static inline bool blk_mq_add_to_batch(struct request *req, else if (iob->complete != complete) return false; iob->need_ts |= blk_mq_need_time_stamp(req); - rq_list_add(&iob->req_list, req); + rq_list_add_head(&iob->req_list, req); return true; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index cd905afaf51a..00212e96261a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1007,6 +1007,11 @@ extern void blk_put_queue(struct request_queue *); void blk_mark_disk_dead(struct gendisk *disk); #ifdef CONFIG_BLOCK +struct rq_list { + struct request *head; + struct request *tail; +}; + /* * blk_plug permits building a queue of related requests by holding the I/O * fragments for a short period. This allows merging of sequential requests @@ -1019,10 +1024,10 @@ void blk_mark_disk_dead(struct gendisk *disk); * blk_flush_plug() is called. */ struct blk_plug { - struct request *mq_list; /* blk-mq requests */ + struct rq_list mq_list; /* blk-mq requests */ /* if ios_left is > 1, we can batch tag/rq allocations */ - struct request *cached_rq; + struct rq_list cached_rqs; u64 cur_ktime; unsigned short nr_ios; @@ -1684,7 +1689,7 @@ int bdev_thaw(struct block_device *bdev); void bdev_fput(struct file *bdev_file); struct io_comp_batch { - struct request *req_list; + struct rq_list req_list; bool need_ts; void (*complete)(struct io_comp_batch *); }; -- cgit v1.2.3 From 00e8d290b55f2fa5c5a0500b4dccf9e090650447 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Nov 2024 16:20:46 +0100 Subject: block: don't reorder requests in blk_mq_add_to_batch LIFO ordering for batched completions is a bit unexpected and also defeats some merging optimizations in e.g. the XFS buffered write code. Now that we can easily add the request to the tail of the list do that. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241113152050.157179-7-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index c61e04365677..c596e0e4cb75 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -884,7 +884,7 @@ static inline bool blk_mq_add_to_batch(struct request *req, else if (iob->complete != complete) return false; iob->need_ts |= blk_mq_need_time_stamp(req); - rq_list_add_head(&iob->req_list, req); + rq_list_add_tail(&iob->req_list, req); return true; } -- cgit v1.2.3 From 27184f8905ba680f22abf1707fbed24036a67119 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Wed, 13 Nov 2024 07:54:12 +0200 Subject: tpm: Opt-in in disable PCR integrity protection The initial HMAC session feature added TPM bus encryption and/or integrity protection to various in-kernel TPM operations. This can cause performance bottlenecks with IMA, as it heavily utilizes PCR extend operations. In order to mitigate this performance issue, introduce a kernel command-line parameter to the TPM driver for disabling the integrity protection for PCR extend operations (i.e. TPM2_PCR_Extend). Cc: James Bottomley Link: https://lore.kernel.org/linux-integrity/20241015193916.59964-1-zohar@linux.ibm.com/ Fixes: 6519fea6fd37 ("tpm: add hmac checks to tpm2_pcr_extend()") Tested-by: Mimi Zohar Co-developed-by: Roberto Sassu Signed-off-by: Roberto Sassu Co-developed-by: Mimi Zohar Signed-off-by: Mimi Zohar Signed-off-by: Jarkko Sakkinen --- include/linux/tpm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tpm.h b/include/linux/tpm.h index 587b96b4418e..20a40ade8030 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -421,6 +421,7 @@ void tpm_buf_append_u32(struct tpm_buf *buf, const u32 value); u8 tpm_buf_read_u8(struct tpm_buf *buf, off_t *offset); u16 tpm_buf_read_u16(struct tpm_buf *buf, off_t *offset); u32 tpm_buf_read_u32(struct tpm_buf *buf, off_t *offset); +void tpm_buf_append_handle(struct tpm_chip *chip, struct tpm_buf *buf, u32 handle); /* * Check if TPM device is in the firmware upgrade mode. @@ -505,6 +506,8 @@ void tpm_buf_append_name(struct tpm_chip *chip, struct tpm_buf *buf, void tpm_buf_append_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf, u8 attributes, u8 *passphrase, int passphraselen); +void tpm_buf_append_auth(struct tpm_chip *chip, struct tpm_buf *buf, + u8 attributes, u8 *passphrase, int passphraselen); static inline void tpm_buf_append_hmac_session_opt(struct tpm_chip *chip, struct tpm_buf *buf, u8 attributes, -- cgit v1.2.3 From 704806ca400e5daa86c110f14bfdda9d28203bb7 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Wed, 13 Nov 2024 13:51:55 +0200 Subject: virtio: Extend the admin command to include the result size Extend the admin command by incorporating a result size field. This allows higher layers to determine the actual result size from the backend when this information is not included in the result_sg. The additional information introduced here will be used in subsequent patches of this series. Acked-by: Michael S. Tsirkin Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20241113115200.209269-3-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/linux/virtio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 306137a15d07..b5f7a611715a 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -111,6 +111,7 @@ struct virtio_admin_cmd { struct scatterlist *data_sg; struct scatterlist *result_sg; struct completion completion; + u32 result_sg_size; int ret; }; -- cgit v1.2.3 From 52a22c0ed03ce23f20df81f79b23cb6637716fae Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Wed, 13 Nov 2024 13:51:57 +0200 Subject: virtio-pci: Introduce APIs to execute device parts admin commands Introduce APIs to handle the execution of device parts admin commands. These APIs cover functionalities such as mode setting, object creation and destruction, and operations like parts get/set and metadata retrieval. These APIs will be utilized in upcoming patches within this series. Acked-by: Michael S. Tsirkin Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20241113115200.209269-5-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/linux/virtio_pci_admin.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_pci_admin.h b/include/linux/virtio_pci_admin.h index f4a100a0fe2e..dffc92c17ad2 100644 --- a/include/linux/virtio_pci_admin.h +++ b/include/linux/virtio_pci_admin.h @@ -20,4 +20,15 @@ int virtio_pci_admin_legacy_io_notify_info(struct pci_dev *pdev, u64 *bar_offset); #endif +bool virtio_pci_admin_has_dev_parts(struct pci_dev *pdev); +int virtio_pci_admin_mode_set(struct pci_dev *pdev, u8 mode); +int virtio_pci_admin_obj_create(struct pci_dev *pdev, u16 obj_type, u8 operation_type, + u32 *obj_id); +int virtio_pci_admin_obj_destroy(struct pci_dev *pdev, u16 obj_type, u32 id); +int virtio_pci_admin_dev_parts_metadata_get(struct pci_dev *pdev, u16 obj_type, + u32 id, u8 metadata_type, u32 *out); +int virtio_pci_admin_dev_parts_get(struct pci_dev *pdev, u16 obj_type, u32 id, + u8 get_type, struct scatterlist *res_sg, u32 *res_size); +int virtio_pci_admin_dev_parts_set(struct pci_dev *pdev, struct scatterlist *data_sg); + #endif /* _LINUX_VIRTIO_PCI_ADMIN_H */ -- cgit v1.2.3 From eb94b7bb10109a14a5431a67e5d8e31cfa06b395 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Mon, 11 Nov 2024 00:17:34 +0100 Subject: net: Make copy_safe_from_sockptr() match documentation copy_safe_from_sockptr() return copy_from_sockptr() return copy_from_sockptr_offset() return copy_from_user() copy_from_user() does not return an error on fault. Instead, it returns a number of bytes that were not copied. Have it handled. Patch has a side effect: it un-breaks garbage input handling of nfc_llcp_setsockopt() and mISDN's data_sock_setsockopt(). Fixes: 6309863b31dd ("net: add copy_safe_from_sockptr() helper") Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20241111-sockptr-copy-ret-fix-v1-1-a520083a93fb@rbox.co Signed-off-by: Jakub Kicinski --- include/linux/sockptr.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index fc5a206c4043..195debe2b1db 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -77,7 +77,9 @@ static inline int copy_safe_from_sockptr(void *dst, size_t ksize, { if (optlen < ksize) return -EINVAL; - return copy_from_sockptr(dst, optval, ksize); + if (copy_from_sockptr(dst, optval, ksize)) + return -EFAULT; + return 0; } static inline int copy_struct_from_sockptr(void *dst, size_t ksize, -- cgit v1.2.3 From 16220cb315a0a10d2a003d9af534988686e675ea Mon Sep 17 00:00:00 2001 From: Tristram Ha Date: Fri, 8 Nov 2024 17:57:05 -0800 Subject: net: dsa: microchip: Add LAN9646 switch support to KSZ DSA driver LAN9646 switch is a 6-port switch with functions like KSZ9897. It has 4 internal PHYs and 1 SGMII port. The chip id read from hardware is same as KSZ9477, so software driver needs to create a new chip id and group allowable functions under its chip data structure to differentiate the product. Signed-off-by: Tristram Ha Link: https://patch.msgid.link/20241109015705.82685-3-Tristram.Ha@microchip.com Signed-off-by: Jakub Kicinski --- include/linux/platform_data/microchip-ksz.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/microchip-ksz.h b/include/linux/platform_data/microchip-ksz.h index 2ee1a679e592..0e0e8fe6975f 100644 --- a/include/linux/platform_data/microchip-ksz.h +++ b/include/linux/platform_data/microchip-ksz.h @@ -42,6 +42,7 @@ enum ksz_chip_id { LAN9372_CHIP_ID = 0x00937200, LAN9373_CHIP_ID = 0x00937300, LAN9374_CHIP_ID = 0x00937400, + LAN9646_CHIP_ID = 0x00964600, }; struct ksz_platform_data { -- cgit v1.2.3 From e311b04db66aaed1819bdd479d42c4f338f105b9 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 12 Nov 2024 12:56:45 +0000 Subject: soundwire: Update the includes on the sdw.h header There are quite a few things used in the sdw.h header that it relies on the consumer to include. If something is used directly in the header it should be included by the header. Update the includes to cover the missing items, or add forward declarations for things that are only used as pointers. Whilst making the change also alphabetise the list of includes. Signed-off-by: Charles Keepax Link: https://lore.kernel.org/r/20241112125646.590240-1-ckeepax@opensource.cirrus.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 73f655334fe9..1fd4b126287f 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -4,12 +4,19 @@ #ifndef __SOUNDWIRE_H #define __SOUNDWIRE_H +#include #include -#include +#include +#include #include #include +#include #include -#include +#include +#include + +struct dentry; +struct fwnode_handle; struct sdw_bus; struct sdw_slave; -- cgit v1.2.3 From dd690b31de0ed46adc5856698880560b900386ba Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 12 Nov 2024 12:56:46 +0000 Subject: soundwire: Minor formatting fixups in sdw.h header Fixup some minor formatting and whitespace in the sdw.h header file. Signed-off-by: Charles Keepax Link: https://lore.kernel.org/r/20241112125646.590240-2-ckeepax@opensource.cirrus.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 40 +++++++++++++++++----------------------- 1 file changed, 17 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 1fd4b126287f..784656f740f6 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -616,7 +616,6 @@ struct sdw_slave_ops { int (*clk_stop)(struct sdw_slave *slave, enum sdw_clk_stop_mode mode, enum sdw_clk_stop_type type); - }; /** @@ -690,8 +689,7 @@ struct sdw_master_device { container_of(d, struct sdw_master_device, dev) struct sdw_driver { - int (*probe)(struct sdw_slave *sdw, - const struct sdw_device_id *id); + int (*probe)(struct sdw_slave *sdw, const struct sdw_device_id *id); int (*remove)(struct sdw_slave *sdw); void (*shutdown)(struct sdw_slave *sdw); @@ -710,7 +708,7 @@ struct sdw_driver { SDW_SLAVE_ENTRY_EXT((_mfg_id), (_part_id), 0, 0, (_drv_data)) int sdw_handle_slave_status(struct sdw_bus *bus, - enum sdw_slave_status status[]); + enum sdw_slave_status status[]); /* * SDW master structures and APIs @@ -792,15 +790,14 @@ struct sdw_enable_ch { */ struct sdw_master_port_ops { int (*dpn_set_port_params)(struct sdw_bus *bus, - struct sdw_port_params *port_params, - unsigned int bank); + struct sdw_port_params *port_params, + unsigned int bank); int (*dpn_set_port_transport_params)(struct sdw_bus *bus, - struct sdw_transport_params *transport_params, - enum sdw_reg_bank bank); - int (*dpn_port_prep)(struct sdw_bus *bus, - struct sdw_prepare_ch *prepare_ch); + struct sdw_transport_params *transport_params, + enum sdw_reg_bank bank); + int (*dpn_port_prep)(struct sdw_bus *bus, struct sdw_prepare_ch *prepare_ch); int (*dpn_port_enable_ch)(struct sdw_bus *bus, - struct sdw_enable_ch *enable_ch, unsigned int bank); + struct sdw_enable_ch *enable_ch, unsigned int bank); }; struct sdw_msg; @@ -835,14 +832,11 @@ struct sdw_defer { */ struct sdw_master_ops { int (*read_prop)(struct sdw_bus *bus); - u64 (*override_adr) - (struct sdw_bus *bus, u64 addr); - enum sdw_command_response (*xfer_msg) - (struct sdw_bus *bus, struct sdw_msg *msg); - enum sdw_command_response (*xfer_msg_defer) - (struct sdw_bus *bus); + u64 (*override_adr)(struct sdw_bus *bus, u64 addr); + enum sdw_command_response (*xfer_msg)(struct sdw_bus *bus, struct sdw_msg *msg); + enum sdw_command_response (*xfer_msg_defer)(struct sdw_bus *bus); int (*set_bus_conf)(struct sdw_bus *bus, - struct sdw_bus_params *params); + struct sdw_bus_params *params); int (*pre_bank_switch)(struct sdw_bus *bus); int (*post_bank_switch)(struct sdw_bus *bus); u32 (*read_ping_status)(struct sdw_bus *bus); @@ -1019,12 +1013,12 @@ void sdw_release_stream(struct sdw_stream_runtime *stream); int sdw_compute_params(struct sdw_bus *bus); int sdw_stream_add_master(struct sdw_bus *bus, - struct sdw_stream_config *stream_config, - const struct sdw_port_config *port_config, - unsigned int num_ports, - struct sdw_stream_runtime *stream); + struct sdw_stream_config *stream_config, + const struct sdw_port_config *port_config, + unsigned int num_ports, + struct sdw_stream_runtime *stream); int sdw_stream_remove_master(struct sdw_bus *bus, - struct sdw_stream_runtime *stream); + struct sdw_stream_runtime *stream); int sdw_startup_stream(void *sdw_substream); int sdw_prepare_stream(struct sdw_stream_runtime *stream); int sdw_enable_stream(struct sdw_stream_runtime *stream); -- cgit v1.2.3 From 04782e63917dbcb60932fe93df52c4a4e3859d07 Mon Sep 17 00:00:00 2001 From: Colton Lewis Date: Wed, 13 Nov 2024 19:01:52 +0000 Subject: perf/core: Hoist perf_instruction_pointer() and perf_misc_flags() For clarity, rename the arch-specific definitions of these functions to perf_arch_* to denote they are arch-specifc. Define the generic-named functions in one place where they can call the arch-specific ones as needed. Signed-off-by: Colton Lewis Signed-off-by: Ingo Molnar Reviewed-by: Oliver Upton Acked-by: Thomas Richter Acked-by: Mark Rutland Acked-by: Madhavan Srinivasan Acked-by: Kan Liang Link: https://lore.kernel.org/r/20241113190156.2145593-3-coltonlewis@google.com --- include/linux/perf_event.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 91b310052a7c..3b4bf5e329f6 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1676,10 +1676,13 @@ extern void perf_tp_event(u16 event_type, u64 count, void *record, struct task_struct *task); extern void perf_bp_event(struct perf_event *event, void *data); -#ifndef perf_misc_flags -# define perf_misc_flags(regs) \ +extern unsigned long perf_misc_flags(struct pt_regs *regs); +extern unsigned long perf_instruction_pointer(struct pt_regs *regs); + +#ifndef perf_arch_misc_flags +# define perf_arch_misc_flags(regs) \ (user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL) -# define perf_instruction_pointer(regs) instruction_pointer(regs) +# define perf_arch_instruction_pointer(regs) instruction_pointer(regs) #endif #ifndef perf_arch_bpf_user_pt_regs # define perf_arch_bpf_user_pt_regs(regs) regs -- cgit v1.2.3 From 2c47e7a74f445426d156278e339b7abb259e50de Mon Sep 17 00:00:00 2001 From: Colton Lewis Date: Wed, 13 Nov 2024 19:01:55 +0000 Subject: perf/core: Correct perf sampling with guest VMs Previously any PMU overflow interrupt that fired while a VCPU was loaded was recorded as a guest event whether it truly was or not. This resulted in nonsense perf recordings that did not honor perf_event_attr.exclude_guest and recorded guest IPs where it should have recorded host IPs. Rework the sampling logic to only record guest samples for events with exclude_guest = 0. This way any host-only events with exclude_guest set will never see unexpected guest samples. The behaviour of events with exclude_guest = 0 is unchanged. Note that events configured to sample both host and guest may still misattribute a PMI that arrived in the host as a guest event depending on KVM arch and vendor behavior. Signed-off-by: Colton Lewis Signed-off-by: Ingo Molnar Reviewed-by: Oliver Upton Acked-by: Mark Rutland Acked-by: Kan Liang Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Alexander Shishkin Cc: Namhyung Kim Link: https://lore.kernel.org/r/20241113190156.2145593-6-coltonlewis@google.com --- include/linux/perf_event.h | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3b4bf5e329f6..cb99ec8c9e96 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1676,8 +1676,9 @@ extern void perf_tp_event(u16 event_type, u64 count, void *record, struct task_struct *task); extern void perf_bp_event(struct perf_event *event, void *data); -extern unsigned long perf_misc_flags(struct pt_regs *regs); -extern unsigned long perf_instruction_pointer(struct pt_regs *regs); +extern unsigned long perf_misc_flags(struct perf_event *event, struct pt_regs *regs); +extern unsigned long perf_instruction_pointer(struct perf_event *event, + struct pt_regs *regs); #ifndef perf_arch_misc_flags # define perf_arch_misc_flags(regs) \ @@ -1688,6 +1689,22 @@ extern unsigned long perf_instruction_pointer(struct pt_regs *regs); # define perf_arch_bpf_user_pt_regs(regs) regs #endif +#ifndef perf_arch_guest_misc_flags +static inline unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs) +{ + unsigned long guest_state = perf_guest_state(); + + if (!(guest_state & PERF_GUEST_ACTIVE)) + return 0; + + if (guest_state & PERF_GUEST_USER) + return PERF_RECORD_MISC_GUEST_USER; + else + return PERF_RECORD_MISC_GUEST_KERNEL; +} +# define perf_arch_guest_misc_flags(regs) perf_arch_guest_misc_flags(regs) +#endif + static inline bool has_branch_stack(struct perf_event *event) { return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; -- cgit v1.2.3 From 9fed2c0f2f0771b990d068ef0a2b32e770ae6d48 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 13 Nov 2024 09:17:51 -0500 Subject: fs: reduce pointer chasing in is_mgtime() test The is_mgtime test checks whether the FS_MGTIME flag is set in the fstype. To get there from the inode though, we have to dereference 3 pointers. Add a new IOP_MGTIME flag, and have inode_init_always() set that flag when the fstype flag is set. Then, make is_mgtime test for IOP_MGTIME instead. Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20241113-mgtime-v1-1-84e256980e11@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index b1a3bd07711b..a9c6c8cbda50 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -623,6 +623,7 @@ is_uncached_acl(struct posix_acl *acl) #define IOP_NOFOLLOW 0x0004 #define IOP_XATTR 0x0008 #define IOP_DEFAULT_READLINK 0x0010 +#define IOP_MGTIME 0x0020 /* * Keep mostly read-only and often accessed (especially for @@ -2581,7 +2582,7 @@ struct file_system_type { */ static inline bool is_mgtime(const struct inode *inode) { - return inode->i_sb->s_type->fs_flags & FS_MGTIME; + return inode->i_opflags & IOP_MGTIME; } extern struct dentry *mount_bdev(struct file_system_type *fs_type, -- cgit v1.2.3 From 0c32840763b1579c923b4216c18bb756ca4ba473 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Thu, 14 Nov 2024 08:03:57 -0500 Subject: platform/x86/intel/pmt: allow user offset for PMT callbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Usage of the telem sysfs file allows for partial reads at an offset. The current callback method returns the buffer starting from offset 0 only. Include the requested offset in the callback and update the necessary address calculations with the offset. Note: offset addition is moved from the caller to the local usage. For non-callback usage this is unchanged behavior. Fixes: e92affc74cd8 ("platform/x86/intel/vsec: Add PMT read callbacks") Reviewed-by: Andy Shevchenko Signed-off-by: Michael J. Ruhl Link: https://lore.kernel.org/r/20241114130358.2467787-2-michael.j.ruhl@intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/intel_vsec.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h index 11ee185566c3..b94beab64610 100644 --- a/include/linux/intel_vsec.h +++ b/include/linux/intel_vsec.h @@ -74,10 +74,11 @@ enum intel_vsec_quirks { * @pdev: PCI device reference for the callback's use * @guid: ID of data to acccss * @data: buffer for the data to be copied + * @off: offset into the requested buffer * @count: size of buffer */ struct pmt_callbacks { - int (*read_telem)(struct pci_dev *pdev, u32 guid, u64 *data, u32 count); + int (*read_telem)(struct pci_dev *pdev, u32 guid, u64 *data, loff_t off, u32 count); }; /** -- cgit v1.2.3 From 0b3144da31f855fce652303f588416a60991bdef Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 13 Nov 2024 07:49:22 +0100 Subject: USB: make single lock for all usb dynamic id lists There are a number of places where we accidentally pass in a constant structure to later cast it off to a dynamic one, and then attempt to grab a lock on it, which is not a good idea. To help resolve this, move the dynamic id lock out of the dynamic id structure for the driver and into one single lock for all USB dynamic ids. As this lock should never have any real contention (it's only every accessed when a device is added or removed, which is always serialized) there should not be any difference except for some memory savings. Note, this just converts the existing use of the dynamic id lock to the new static lock, there is one place that is accessing the dynamic id list without grabbing the lock, that will be fixed up in a follow-on change. Cc: Johan Hovold Cc: Herve Codina Cc: Rob Herring Cc: Alan Stern Cc: Grant Grundler Cc: Oliver Neukum Cc: Yajun Deng Cc: Douglas Anderson Cc: linux-usb@vger.kernel.org Link: https://lore.kernel.org/r/2024111322-kindly-finalist-d247@gregkh Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 672d8fc2abdb..b66b1af3e439 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1129,8 +1129,8 @@ static inline int usb_make_path(struct usb_device *dev, char *buf, size_t size) /* ----------------------------------------------------------------------- */ /* Stuff for dynamic usb ids */ +extern struct mutex usb_dynids_lock; struct usb_dynids { - spinlock_t lock; struct list_head list; }; -- cgit v1.2.3 From 2f3aab7aecb827ba93c6222646eb0faa8228d590 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 13 Nov 2024 15:04:40 +0100 Subject: USB: make to_usb_driver() use container_of_const() Turns out that we have some const pointers being passed to to_usb_driver() but were not catching this. Change the macro to properly propagate the const-ness of the pointer so that we will notice when we try to write to memory that we shouldn't be writing to. This requires fixing up the usb_match_dynamic_id() function as well, because it can handle a const * to struct usb_driver. Cc: Johan Hovold Cc: Alan Stern Cc: Grant Grundler Cc: Yajun Deng Cc: Oliver Neukum Cc: Douglas Anderson Cc: linux-usb@vger.kernel.org Link: https://lore.kernel.org/r/2024111339-shaky-goldsmith-b233@gregkh Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index b66b1af3e439..7a9e96f9d886 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1243,7 +1243,7 @@ struct usb_driver { unsigned int disable_hub_initiated_lpm:1; unsigned int soft_unbind:1; }; -#define to_usb_driver(d) container_of(d, struct usb_driver, driver) +#define to_usb_driver(d) container_of_const(d, struct usb_driver, driver) /** * struct usb_device_driver - identifies USB device driver to usbcore -- cgit v1.2.3 From d6fa15bbcf9604e3c14816410550d2cf22b955e4 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 13 Nov 2024 15:04:41 +0100 Subject: USB: make to_usb_device_driver() use container_of_const() Turns out that we have some const pointers being passed to to_usb_device_driver() but were not catching this. Change the macro to properly propagate the const-ness of the pointer so that we will notice when we try to write to memory that we shouldn't be writing to. This requires fixing up the usb_driver_applicable() function as well, because it can handle a const * to struct usb_driver. Cc: Johan Hovold Cc: Alan Stern Cc: Grant Grundler Cc: Yajun Deng Cc: Oliver Neukum Cc: Douglas Anderson Cc: linux-usb@vger.kernel.org Link: https://lore.kernel.org/r/2024111342-lagoon-reapprove-5e49@gregkh Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 7a9e96f9d886..cfa8005e24f9 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1294,8 +1294,7 @@ struct usb_device_driver { unsigned int supports_autosuspend:1; unsigned int generic_subclass:1; }; -#define to_usb_device_driver(d) container_of(d, struct usb_device_driver, \ - driver) +#define to_usb_device_driver(d) container_of_const(d, struct usb_device_driver, driver) /** * struct usb_class_driver - identifies a USB driver that wants to use the USB major number -- cgit v1.2.3 From d96c77bd4eeba469bddbbb14323d2191684da82a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 8 Nov 2024 04:56:31 -0500 Subject: KVM: x86: switch hugepage recovery thread to vhost_task kvm_vm_create_worker_thread() is meant to be used for kthreads that can consume significant amounts of CPU time on behalf of a VM or in response to how the VM behaves (for example how it accesses its memory). Therefore it wants to charge the CPU time consumed by that work to the VM's container. However, because of these threads, cgroups which have kvm instances inside never complete freezing. This can be trivially reproduced: root@test ~# mkdir /sys/fs/cgroup/test root@test ~# echo $$ > /sys/fs/cgroup/test/cgroup.procs root@test ~# qemu-system-x86_64 -nographic -enable-kvm and in another terminal: root@test ~# echo 1 > /sys/fs/cgroup/test/cgroup.freeze root@test ~# cat /sys/fs/cgroup/test/cgroup.events populated 1 frozen 0 The cgroup freezing happens in the signal delivery path but kvm_nx_huge_page_recovery_worker, while joining non-root cgroups, never calls into the signal delivery path and thus never gets frozen. Because the cgroup freezer determines whether a given cgroup is frozen by comparing the number of frozen threads to the total number of threads in the cgroup, the cgroup never becomes frozen and users waiting for the state transition may hang indefinitely. Since the worker kthread is tied to a user process, it's better if it behaves similarly to user tasks as much as possible, including being able to send SIGSTOP and SIGCONT. In fact, vhost_task is all that kvm_vm_create_worker_thread() wanted to be and more: not only it inherits the userspace process's cgroups, it has other niceties like being parented properly in the process tree. Use it instead of the homegrown alternative. Incidentally, the new code is also better behaved when you flip recovery back and forth to disabled and back to enabled. If your recovery period is 1 minute, it will run the next recovery after 1 minute independent of how many times you flipped the parameter. (Commit message based on emails from Tejun). Reported-by: Tejun Heo Reported-by: Luca Boccassi Acked-by: Tejun Heo Tested-by: Luca Boccassi Cc: stable@vger.kernel.org Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 03e4d26e3bcc..401439bb21e3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2425,12 +2425,6 @@ static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) } #endif /* CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE */ -typedef int (*kvm_vm_thread_fn_t)(struct kvm *kvm, uintptr_t data); - -int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn, - uintptr_t data, const char *name, - struct task_struct **thread_ptr); - #ifdef CONFIG_KVM_XFER_TO_GUEST_WORK static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu) { -- cgit v1.2.3 From 4eb5e9c6c4cdfca9fb0577f62580db46f5acd1a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Lebrun?= Date: Wed, 6 Nov 2024 17:03:54 +0100 Subject: clk: fixed-factor: add clk_hw_register_fixed_factor_index() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add non-devres version of clk_hw_register_fixed_factor(), with parent targeted using its index. Signed-off-by: Théo Lebrun Link: https://lore.kernel.org/r/20241106-mbly-clk-v2-3-84cfefb3f485@bootlin.com Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index dbe793964c24..52b46c63a2bd 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -1142,6 +1142,9 @@ struct clk_hw *clk_hw_register_fixed_factor_with_accuracy_fwname(struct device * struct device_node *np, const char *name, const char *fw_name, unsigned long flags, unsigned int mult, unsigned int div, unsigned long acc); +struct clk_hw *clk_hw_register_fixed_factor_index(struct device *dev, + const char *name, unsigned int index, unsigned long flags, + unsigned int mult, unsigned int div); void clk_hw_unregister_fixed_factor(struct clk_hw *hw); struct clk_hw *devm_clk_hw_register_fixed_factor(struct device *dev, const char *name, const char *parent_name, unsigned long flags, -- cgit v1.2.3 From 721aa69e708b7432af83c4bb00a30e2b7c27da28 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 8 Nov 2024 07:54:47 +0100 Subject: net: phy: convert eee_broken_modes to a linkmode bitmap eee_broken_modes has a eee_cap1 register layout currently. This doen't allow to flag e.g. 2.5Gbps or 5Gbps BaseT EEE as broken. To overcome this limitation switch eee_broken_modes to a linkmode bitmap. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/dfe0c9ff-84b0-4328-86d7-e917ebc084a1@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 1e4127c495c0..a6622f873d03 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -721,16 +721,15 @@ struct phy_device { /* used for eee validation and configuration*/ __ETHTOOL_DECLARE_LINK_MODE_MASK(supported_eee); __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising_eee); + /* Energy efficient ethernet modes which should be prohibited */ + __ETHTOOL_DECLARE_LINK_MODE_MASK(eee_broken_modes); bool eee_enabled; + bool enable_tx_lpi; + struct eee_config eee_cfg; /* Host supported PHY interface types. Should be ignored if empty. */ DECLARE_PHY_INTERFACE_MASK(host_interfaces); - /* Energy efficient ethernet modes which should be prohibited */ - u32 eee_broken_modes; - bool enable_tx_lpi; - struct eee_config eee_cfg; - #ifdef CONFIG_LED_TRIGGER_PHY struct phy_led_trigger *phy_led_triggers; unsigned int phy_num_led_triggers; -- cgit v1.2.3 From ed623fb8e38e2a241da12864778ec9c9cf930c65 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 8 Nov 2024 08:07:10 +0100 Subject: net: phy: add phy_set_eee_broken Add an accessor for eee_broken_modes, so that drivers don't have to deal with phylib internals. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/0f8ee279-d40d-4489-a3b0-d993472d744a@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index a6622f873d03..b8346db42727 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1263,6 +1263,16 @@ void of_set_phy_eee_broken(struct phy_device *phydev); void of_set_phy_timing_role(struct phy_device *phydev); int phy_speed_down_core(struct phy_device *phydev); +/** + * phy_set_eee_broken - Mark an EEE mode as broken so that it isn't advertised. + * @phydev: The phy_device struct + * @link_mode: The broken EEE mode + */ +static inline void phy_set_eee_broken(struct phy_device *phydev, u32 link_mode) +{ + linkmode_set_bit(link_mode, phydev->eee_broken_modes); +} + /** * phy_is_started - Convenience function to check whether PHY is started * @phydev: The phy_device struct -- cgit v1.2.3 From e7cb7cf43afb96a4fd8f68c2dfb9b2fbdd444654 Mon Sep 17 00:00:00 2001 From: Alistair Francis Date: Tue, 12 Nov 2024 20:54:29 +1000 Subject: include: mdio: Remove mdio45_ethtool_gset() mdio45_ethtool_gset() is never called, so let's remove it. Signed-off-by: Alistair Francis Link: https://patch.msgid.link/20241112105430.438491-1-alistair@alistair23.me Signed-off-by: Jakub Kicinski --- include/linux/mdio.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index efeca5bd7600..c63f43645d50 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -173,22 +173,6 @@ mdio45_ethtool_ksettings_get_npage(const struct mdio_if_info *mdio, struct ethtool_link_ksettings *cmd, u32 npage_adv, u32 npage_lpa); -/** - * mdio45_ethtool_gset - get settings for ETHTOOL_GSET - * @mdio: MDIO interface - * @ecmd: Ethtool request structure - * - * Since the CSRs for auto-negotiation using next pages are not fully - * standardised, this function does not attempt to decode them. Use - * mdio45_ethtool_gset_npage() to specify advertisement bits from next - * pages. - */ -static inline void mdio45_ethtool_gset(const struct mdio_if_info *mdio, - struct ethtool_cmd *ecmd) -{ - mdio45_ethtool_gset_npage(mdio, ecmd, 0, 0); -} - /** * mdio45_ethtool_ksettings_get - get settings for ETHTOOL_GLINKSETTINGS * @mdio: MDIO interface -- cgit v1.2.3 From 575092a7f0ce4c6d5907cd908d28c0f69690a136 Mon Sep 17 00:00:00 2001 From: Alistair Francis Date: Tue, 12 Nov 2024 20:54:30 +1000 Subject: mdio: Remove mdio45_ethtool_gset_npage() The mdio45_ethtool_gset_npage() function isn't called, so let's remove it. Signed-off-by: Alistair Francis Link: https://patch.msgid.link/20241112105430.438491-2-alistair@alistair23.me Signed-off-by: Jakub Kicinski --- include/linux/mdio.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index c63f43645d50..3c3deac57894 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -165,9 +165,6 @@ extern int mdio_set_flag(const struct mdio_if_info *mdio, bool sense); extern int mdio45_links_ok(const struct mdio_if_info *mdio, u32 mmds); extern int mdio45_nway_restart(const struct mdio_if_info *mdio); -extern void mdio45_ethtool_gset_npage(const struct mdio_if_info *mdio, - struct ethtool_cmd *ecmd, - u32 npage_adv, u32 npage_lpa); extern void mdio45_ethtool_ksettings_get_npage(const struct mdio_if_info *mdio, struct ethtool_link_ksettings *cmd, -- cgit v1.2.3 From 9e43ad7a1edef268acac603e1975c8f50a20d02f Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Wed, 13 Nov 2024 12:13:09 +0000 Subject: net: ethtool: only allow set_rxnfc with rss + ring_cookie if driver opts in Ethtool ntuple filters with FLOW_RSS were originally defined as adding the base queue ID (ring_cookie) to the value from the indirection table, so that the same table could distribute over more than one set of queues when used by different filters. However, some drivers / hardware ignore the ring_cookie, and simply use the indirection table entries as queue IDs directly. Thus, for drivers which have not opted in by setting ethtool_ops.cap_rss_rxnfc_adds to declare that they support the original (addition) semantics, reject in ethtool_set_rxnfc any filter which combines FLOW_RSS and a nonzero ring. (For a ring_cookie of zero, both behaviours are equivalent.) Set the cap bit in sfc, as it is known to support this feature. Signed-off-by: Edward Cree Reviewed-by: Martin Habets Link: https://patch.msgid.link/cc3da0844083b0e301a33092a6299e4042b65221.1731499022.git.ecree.xilinx@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 1199e308c8dd..299280c94d07 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -734,6 +734,9 @@ struct kernel_ethtool_ts_info { * @rxfh_per_ctx_key: device supports setting different RSS key for each * additional context. Netlink API should report hfunc, key, and input_xfrm * for every context, not just context 0. + * @cap_rss_rxnfc_adds: device supports nonzero ring_cookie in filters with + * %FLOW_RSS flag; the queue ID from the filter is added to the value from + * the indirection table to determine the delivery queue. * @rxfh_indir_space: max size of RSS indirection tables, if indirection table * size as returned by @get_rxfh_indir_size may change during lifetime * of the device. Leave as 0 if the table size is constant. @@ -956,6 +959,7 @@ struct ethtool_ops { u32 cap_rss_ctx_supported:1; u32 cap_rss_sym_xor_supported:1; u32 rxfh_per_ctx_key:1; + u32 cap_rss_rxnfc_adds:1; u32 rxfh_indir_space; u16 rxfh_key_space; u16 rxfh_priv_size; -- cgit v1.2.3 From fd7b4f9f46d46acbc7af3a439bb0d869efdc5c58 Mon Sep 17 00:00:00 2001 From: Qun-Wei Lin Date: Wed, 13 Nov 2024 12:25:43 +0800 Subject: sched/task_stack: fix object_is_on_stack() for KASAN tagged pointers When CONFIG_KASAN_SW_TAGS and CONFIG_KASAN_STACK are enabled, the object_is_on_stack() function may produce incorrect results due to the presence of tags in the obj pointer, while the stack pointer does not have tags. This discrepancy can lead to incorrect stack object detection and subsequently trigger warnings if CONFIG_DEBUG_OBJECTS is also enabled. Example of the warning: ODEBUG: object 3eff800082ea7bb0 is NOT on stack ffff800082ea0000, but annotated. ------------[ cut here ]------------ WARNING: CPU: 0 PID: 1 at lib/debugobjects.c:557 __debug_object_init+0x330/0x364 Modules linked in: CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.12.0-rc5 #4 Hardware name: linux,dummy-virt (DT) pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : __debug_object_init+0x330/0x364 lr : __debug_object_init+0x330/0x364 sp : ffff800082ea7b40 x29: ffff800082ea7b40 x28: 98ff0000c0164518 x27: 98ff0000c0164534 x26: ffff800082d93ec8 x25: 0000000000000001 x24: 1cff0000c00172a0 x23: 0000000000000000 x22: ffff800082d93ed0 x21: ffff800081a24418 x20: 3eff800082ea7bb0 x19: efff800000000000 x18: 0000000000000000 x17: 00000000000000ff x16: 0000000000000047 x15: 206b63617473206e x14: 0000000000000018 x13: ffff800082ea7780 x12: 0ffff800082ea78e x11: 0ffff800082ea790 x10: 0ffff800082ea79d x9 : 34d77febe173e800 x8 : 34d77febe173e800 x7 : 0000000000000001 x6 : 0000000000000001 x5 : feff800082ea74b8 x4 : ffff800082870a90 x3 : ffff80008018d3c4 x2 : 0000000000000001 x1 : ffff800082858810 x0 : 0000000000000050 Call trace: __debug_object_init+0x330/0x364 debug_object_init_on_stack+0x30/0x3c schedule_hrtimeout_range_clock+0xac/0x26c schedule_hrtimeout+0x1c/0x30 wait_task_inactive+0x1d4/0x25c kthread_bind_mask+0x28/0x98 init_rescuer+0x1e8/0x280 workqueue_init+0x1a0/0x3cc kernel_init_freeable+0x118/0x200 kernel_init+0x28/0x1f0 ret_from_fork+0x10/0x20 ---[ end trace 0000000000000000 ]--- ODEBUG: object 3eff800082ea7bb0 is NOT on stack ffff800082ea0000, but annotated. ------------[ cut here ]------------ Link: https://lkml.kernel.org/r/20241113042544.19095-1-qun-wei.lin@mediatek.com Signed-off-by: Qun-Wei Lin Cc: Andrew Yang Cc: AngeloGioacchino Del Regno Cc: Casper Li Cc: Catalin Marinas Cc: Chinwen Chang Cc: Kent Overstreet Cc: Matthias Brugger Cc: Pasha Tatashin Cc: Shakeel Butt Cc: Signed-off-by: Andrew Morton --- include/linux/sched/task_stack.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h index bf10bdb487dd..6c2fef89a4fd 100644 --- a/include/linux/sched/task_stack.h +++ b/include/linux/sched/task_stack.h @@ -9,6 +9,7 @@ #include #include #include +#include #ifdef CONFIG_THREAD_INFO_IN_TASK @@ -89,6 +90,7 @@ static inline int object_is_on_stack(const void *obj) { void *stack = task_stack_page(current); + obj = kasan_reset_tag(obj); return (obj >= stack) && (obj < (stack + THREAD_SIZE)); } -- cgit v1.2.3 From 05d4532b60e3e6e2a094ec56a88d1def50bd2430 Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Fri, 1 Nov 2024 13:44:02 -0700 Subject: memcg/hugetlb: add hugeTLB counters to memcg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces a new counter to memory.stat that tracks hugeTLB usage, only if hugeTLB accounting is done to memory.current. This feature is enabled the same way hugeTLB accounting is enabled, via the memory_hugetlb_accounting mount flag for cgroupsv2. 1. Why is this patch necessary? Currently, memcg hugeTLB accounting is an opt-in feature [1] that adds hugeTLB usage to memory.current. However, the metric is not reported in memory.stat. Given that users often interpret memory.stat as a breakdown of the value reported in memory.current, the disparity between the two reports can be confusing. This patch solves this problem by including the metric in memory.stat as well, but only if it is also reported in memory.current (it would also be confusing if the value was reported in memory.stat, but not in memory.current) Aside from the consistency between the two files, we also see benefits in observability. Userspace might be interested in the hugeTLB footprint of cgroups for many reasons. For instance, system admins might want to verify that hugeTLB usage is distributed as expected across tasks: i.e. memory-intensive tasks are using more hugeTLB pages than tasks that don't consume a lot of memory, or are seen to fault frequently. Note that this is separate from wanting to inspect the distribution for limiting purposes (in which case, hugeTLB controller makes more sense). 2. We already have a hugeTLB controller. Why not use that? It is true that hugeTLB tracks the exact value that we want. In fact, by enabling the hugeTLB controller, we get all of the observability benefits that I mentioned above, and users can check the total hugeTLB usage, verify if it is distributed as expected, etc. With this said, there are 2 problems: (a) They are still not reported in memory.stat, which means the disparity between the memcg reports are still there. (b) We cannot reasonably expect users to enable the hugeTLB controller just for the sake of hugeTLB usage reporting, especially since they don't have any use for hugeTLB usage enforcing [2]. 3. Implementation Details: In the alloc / free hugetlb functions, we call lruvec_stat_mod_folio regardless of whether memcg accounts hugetlb. mem_cgroup_commit_charge which is called from alloc_hugetlb_folio will set memcg for the folio only if the CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING cgroup mount option is used, so lruvec_stat_mod_folio accounts per-memcg hugetlb counters only if the feature is enabled. Regardless of whether memcg accounts for hugetlb, the newly added global counter is updated and shown in /proc/vmstat. The global counter is added because vmstats is the preferred framework for cgroup stats. It makes stat items consistent between global and cgroups. It also provides a per-node breakdown, which is useful. Because it does not use cgroup-specific hooks, we also keep generic MM code separate from memcg code. [1] https://lore.kernel.org/all/20231006184629.155543-1-nphamcs@gmail.com/ [2] Of course, we can't make a new patch for every feature that can be duplicated. However, since the existing solution of enabling the hugeTLB controller is an imperfect solution that still leaves a discrepancy between memory.stat and memory.curent, I think that it is reasonable to isolate the feature in this case. Link: https://lkml.kernel.org/r/20241101204402.1885383-1-joshua.hahnjy@gmail.com Signed-off-by: Joshua Hahn Suggested-by: Nhat Pham Suggested-by: Shakeel Butt Suggested-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Chris Down Acked-by: Michal Hocko Reviewed-by: Roman Gushchin Reviewed-by: Nhat Pham Cc: Jonathan Corbet Cc: Michal Koutný Cc: Muchun Song Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5e8f567753bd..b36124145a16 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -220,6 +220,9 @@ enum node_stat_item { PGDEMOTE_KSWAPD, PGDEMOTE_DIRECT, PGDEMOTE_KHUGEPAGED, +#ifdef CONFIG_HUGETLB_PAGE + NR_HUGETLB, +#endif NR_VM_NODE_STAT_ITEMS }; -- cgit v1.2.3 From 4a530a7c751d27f9dbd70b7fc45670cd11713b13 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 11 Oct 2024 11:00:21 +0200 Subject: fs: prepare for "explicit connectable" file handles We would like to use the high 16bit of the handle_type field to encode file handle traits, such as "connectable". In preparation for this change, make sure that filesystems do not return a handle_type value with upper bits set and that the open_by_handle_at(2) syscall rejects these handle types. Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20241011090023.655623-2-amir73il@gmail.com Fixes: 570df4e9c23f ("ceph: snapshot nfs re-export") Acked-by: Reviewed-by: Jan Kara Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/exportfs.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 893a1d21dc1c..5e14d4500a75 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -160,6 +160,17 @@ struct fid { #define EXPORT_FH_FID 0x2 /* File handle may be non-decodeable */ #define EXPORT_FH_DIR_ONLY 0x4 /* Only decode file handle for a directory */ +/* + * Filesystems use only lower 8 bits of file_handle type for fid_type. + * name_to_handle_at() uses upper 16 bits of type as user flags to be + * interpreted by open_by_handle_at(). + */ +#define FILEID_USER_FLAGS_MASK 0xffff0000 +#define FILEID_USER_FLAGS(type) ((type) & FILEID_USER_FLAGS_MASK) + +/* Flags supported in encoded handle_type that is exported to user */ +#define FILEID_VALID_USER_FLAGS (0) + /** * struct export_operations - for nfsd to communicate with file systems * @encode_fh: encode a file handle fragment from a dentry -- cgit v1.2.3 From c374196b2b9f4b803fccd59ed82f0712041e21e1 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 11 Oct 2024 11:00:22 +0200 Subject: fs: name_to_handle_at() support for "explicit connectable" file handles nfsd encodes "connectable" file handles for the subtree_check feature, which can be resolved to an open file with a connected path. So far, userspace nfs server could not make use of this functionality. Introduce a new flag AT_HANDLE_CONNECTABLE to name_to_handle_at(2). When used, the encoded file handle is "explicitly connectable". The "explicitly connectable" file handle sets bits in the high 16bit of the handle_type field, so open_by_handle_at(2) will know that it needs to open a file with a connected path. old kernels will now recognize the handle_type with high bits set, so "explicitly connectable" file handles cannot be decoded by open_by_handle_at(2) on old kernels. The flag AT_HANDLE_CONNECTABLE is not allowed together with either AT_HANDLE_FID or AT_EMPTY_PATH. Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20241011090023.655623-3-amir73il@gmail.com Fixes: 570df4e9c23f ("ceph: snapshot nfs re-export") Acked-by: Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/exportfs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 5e14d4500a75..4ee42b2cf4ab 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -169,6 +169,8 @@ struct fid { #define FILEID_USER_FLAGS(type) ((type) & FILEID_USER_FLAGS_MASK) /* Flags supported in encoded handle_type that is exported to user */ +#define FILEID_IS_CONNECTABLE 0x10000 +#define FILEID_IS_DIR 0x20000 #define FILEID_VALID_USER_FLAGS (0) /** -- cgit v1.2.3 From a20853ab8296d4a8754482cb5e9adde8ab426a25 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 11 Oct 2024 11:00:23 +0200 Subject: fs: open_by_handle_at() support for decoding "explicit connectable" file handles Teach open_by_handle_at(2) about the type format of "explicit connectable" file handles that were created using the AT_HANDLE_CONNECTABLE flag to name_to_handle_at(2). When decoding an "explicit connectable" file handles, name_to_handle_at(2) should fail if it cannot open a "connected" fd with known path, which is accessible (to capable user) from mount fd path. Note that this does not check if the path is accessible to the calling user, just that it is accessible wrt the mount namesapce, so if there is no "connected" alias, or if parts of the path are hidden in the mount namespace, open_by_handle_at(2) will return -ESTALE. Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20241011090023.655623-4-amir73il@gmail.com Fixes: 570df4e9c23f ("ceph: snapshot nfs re-export") Acked-by: Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/exportfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 4ee42b2cf4ab..fcab6ab1d38a 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -171,7 +171,7 @@ struct fid { /* Flags supported in encoded handle_type that is exported to user */ #define FILEID_IS_CONNECTABLE 0x10000 #define FILEID_IS_DIR 0x20000 -#define FILEID_VALID_USER_FLAGS (0) +#define FILEID_VALID_USER_FLAGS (FILEID_IS_CONNECTABLE | FILEID_IS_DIR) /** * struct export_operations - for nfsd to communicate with file systems -- cgit v1.2.3 From 957860cbc1dc89f79f2acc193470224e350dfd03 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Nov 2024 07:14:03 -0700 Subject: block: make struct rq_list available for !CONFIG_BLOCK A previous commit changed how requests are linked in the plug structure, but unlike the previous method, it uses a new type for it rather than struct request. The latter is available even for !CONFIG_BLOCK, while struct rq_list is now. Move it outside CONFIG_BLOCK. Reported-by: Nathan Chancellor Fixes: a3396b99990d ("block: add a rq_list type") Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 00212e96261a..a1fd0ddce5cf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1006,12 +1006,12 @@ extern void blk_put_queue(struct request_queue *); void blk_mark_disk_dead(struct gendisk *disk); -#ifdef CONFIG_BLOCK struct rq_list { struct request *head; struct request *tail; }; +#ifdef CONFIG_BLOCK /* * blk_plug permits building a queue of related requests by holding the I/O * fragments for a short period. This allows merging of sequential requests -- cgit v1.2.3 From 7eb4e1dd71009472215bc1f8226ee9a041da8d37 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 12 Nov 2024 18:52:16 +0000 Subject: x86/efi: Drop support for the EFI_PROPERTIES_TABLE Drop support for the EFI_PROPERTIES_TABLE. It was a failed, short-lived experiment that broke the boot both on Linux and Windows, and was replaced by the EFI_MEMORY_ATTRIBUTES_TABLE shortly after. Suggested-by: Ard Biesheuvel Signed-off-by: Nicolas Saenz Julienne Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index e28d88066033..e5815867aba9 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -379,7 +379,6 @@ void efi_native_runtime_setup(void); #define EFI_SYSTEM_RESOURCE_TABLE_GUID EFI_GUID(0xb122a263, 0x3661, 0x4f68, 0x99, 0x29, 0x78, 0xf8, 0xb0, 0xd6, 0x21, 0x80) #define EFI_FILE_SYSTEM_GUID EFI_GUID(0x964e5b22, 0x6459, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b) #define DEVICE_TREE_GUID EFI_GUID(0xb1b621d5, 0xf19c, 0x41a5, 0x83, 0x0b, 0xd9, 0x15, 0x2c, 0x69, 0xaa, 0xe0) -#define EFI_PROPERTIES_TABLE_GUID EFI_GUID(0x880aaca3, 0x4adc, 0x4a04, 0x90, 0x79, 0xb7, 0x47, 0x34, 0x08, 0x25, 0xe5) #define EFI_RNG_PROTOCOL_GUID EFI_GUID(0x3152bca5, 0xeade, 0x433d, 0x86, 0x2e, 0xc0, 0x1c, 0xdc, 0x29, 0x1f, 0x44) #define EFI_RNG_ALGORITHM_RAW EFI_GUID(0xe43176d7, 0xb6e8, 0x4827, 0xb7, 0x84, 0x7f, 0xfd, 0xc4, 0xb6, 0x85, 0x61) #define EFI_MEMORY_ATTRIBUTES_TABLE_GUID EFI_GUID(0xdcfa911d, 0x26eb, 0x469f, 0xa2, 0x20, 0x38, 0xb7, 0xdc, 0x46, 0x12, 0x20) @@ -580,15 +579,6 @@ struct efi_mem_range { u64 attribute; }; -typedef struct { - u32 version; - u32 length; - u64 memory_protection_attribute; -} efi_properties_table_t; - -#define EFI_PROPERTIES_TABLE_VERSION 0x00010000 -#define EFI_PROPERTIES_RUNTIME_MEMORY_PROTECTION_NON_EXECUTABLE_PE_DATA 0x1 - typedef struct { u16 version; u16 length; @@ -871,10 +861,9 @@ static inline int efi_range_is_wc(unsigned long start, unsigned long len) #define EFI_PARAVIRT 6 /* Access is via a paravirt interface */ #define EFI_ARCH_1 7 /* First arch-specific bit */ #define EFI_DBG 8 /* Print additional debug info at runtime */ -#define EFI_NX_PE_DATA 9 /* Can runtime data regions be mapped non-executable? */ -#define EFI_MEM_ATTR 10 /* Did firmware publish an EFI_MEMORY_ATTRIBUTES table? */ -#define EFI_MEM_NO_SOFT_RESERVE 11 /* Is the kernel configured to ignore soft reservations? */ -#define EFI_PRESERVE_BS_REGIONS 12 /* Are EFI boot-services memory segments available? */ +#define EFI_MEM_ATTR 9 /* Did firmware publish an EFI_MEMORY_ATTRIBUTES table? */ +#define EFI_MEM_NO_SOFT_RESERVE 10 /* Is the kernel configured to ignore soft reservations? */ +#define EFI_PRESERVE_BS_REGIONS 11 /* Are EFI boot-services memory segments available? */ #ifdef CONFIG_EFI /* -- cgit v1.2.3 From 83e041522eb9c45479f4490b212687cf1e7e9999 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 15 Nov 2024 16:54:40 +0000 Subject: io_uring: temporarily disable registered waits Disable wait argument registration as it'll be replaced with a more generic feature. We'll still need IORING_ENTER_EXT_ARG_REG parsing in a few commits so leave it be. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/70b1d1d218c41ba77a76d1789c8641dab0b0563e.1731689588.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 072e65e93105..52a5da99a205 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -330,14 +330,6 @@ struct io_ring_ctx { atomic_t cq_wait_nr; atomic_t cq_timeouts; struct wait_queue_head cq_wait; - - /* - * If registered with IORING_REGISTER_CQWAIT_REG, a single - * page holds N entries, mapped in cq_wait_arg. cq_wait_index - * is the maximum allowable index. - */ - struct io_uring_reg_wait *cq_wait_arg; - unsigned char cq_wait_index; } ____cacheline_aligned_in_smp; /* timeouts */ @@ -431,8 +423,6 @@ struct io_ring_ctx { unsigned short n_sqe_pages; struct page **ring_pages; struct page **sqe_pages; - - struct page **cq_wait_page; }; struct io_tw_state { -- cgit v1.2.3 From dfbbfbf191878e8dd422768ce009858d8b5b761e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 15 Nov 2024 16:54:41 +0000 Subject: io_uring: introduce concept of memory regions We've got a good number of mappings we share with the userspace, that includes the main rings, provided buffer rings, upcoming rings for zerocopy rx and more. All of them duplicate user argument parsing and some internal details as well (page pinnning, huge page optimisations, mmap'ing, etc.) Introduce a notion of regions. For userspace for now it's just a new structure called struct io_uring_region_desc which is supposed to parameterise all such mapping / queue creations. A region either represents a user provided chunk of memory, in which case the user_addr field should point to it, or a request for the kernel to allocate the memory, in which case the user would need to mmap it after using the offset returned in the mmap_offset field. With a uniform userspace API we can avoid additional boiler plate code and apply future optimisation to all of them at once. Internally, there is a new structure struct io_mapped_region holding all relevant runtime information and some helpers to work with it. This patch limits it to user provided regions. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0e6fe25818dfbaebd1bd90b870a6cac503fe1a24.1731689588.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 52a5da99a205..1d3a37234ace 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -75,6 +75,12 @@ struct io_hash_table { unsigned hash_bits; }; +struct io_mapped_region { + struct page **pages; + void *vmap_ptr; + size_t nr_pages; +}; + /* * Arbitrary limit, can be raised if need be */ -- cgit v1.2.3 From 93238e66185524aad925acefb2312203b9e26d63 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 15 Nov 2024 16:54:42 +0000 Subject: io_uring: add memory region registration Regions will serve multiple purposes. First, with it we can decouple ring/etc. object creation from registration / mapping of the memory they will be placed in. We already have hacks that allow to put both SQ and CQ into the same huge page, in the future we should be able to: region = create_region(io_ring); create_pbuf_ring(io_uring, region, offset=0); create_pbuf_ring(io_uring, region, offset=N); The second use case is efficiently passing parameters. The following patch enables back on top of regions IORING_ENTER_EXT_ARG_REG, which optimises wait arguments. It'll also be useful for request arguments replacing iovecs, msghdr, etc. pointers. Eventually it would also be handy for BPF as well if it comes to fruition. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0798cf3a14fad19cfc96fc9feca5f3e11481691d.1731689588.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 1d3a37234ace..e1d69123e164 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -429,6 +429,9 @@ struct io_ring_ctx { unsigned short n_sqe_pages; struct page **ring_pages; struct page **sqe_pages; + + /* used for optimised request parameter and wait argument passing */ + struct io_mapped_region param_region; }; struct io_tw_state { -- cgit v1.2.3 From 96a30e469ca1d2b8cc7811b40911f8614b558241 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Nov 2024 16:13:03 -0800 Subject: bpf: use common instruction history across all states Instead of allocating and copying instruction history each time we enqueue child verifier state, switch to a model where we use one common dynamically sized array of instruction history entries across all states. The key observation for proving this is correct is that instruction history is only relevant while state is active, which means it either is a current state (and thus we are actively modifying instruction history and no other state can interfere with us) or we are checkpointed state with some children still active (either enqueued or being current). In the latter case our portion of instruction history is finalized and won't change or grow, so as long as we keep it immutable until the state is finalized, we are good. Now, when state is finalized and is put into state hash for potentially future pruning lookups, instruction history is not used anymore. This is because instruction history is only used by precision marking logic, and we never modify precision markings for finalized states. So, instead of each state having its own small instruction history, we keep a global dynamically-sized instruction history, where each state in current DFS path from root to active state remembers its portion of instruction history. Current state can append to this history, but cannot modify any of its parent histories. Async callback state enqueueing, while logically detached from parent state, still is part of verification backtracking tree, so has to follow the same schema as normal state checkpoints. Because the insn_hist array can be grown through realloc, states don't keep pointers, they instead maintain two indices, [start, end), into global instruction history array. End is exclusive index, so `start == end` means there is no relevant instruction history. This eliminates a lot of allocations and minimizes overall memory usage. For instance, running a worst-case test from [0] (but without the heuristics-based fix [1]), it took 12.5 minutes until we get -ENOMEM. With the changes in this patch the whole test succeeds in 10 minutes (very slow, so heuristics from [1] is important, of course). To further validate correctness, veristat-based comparison was performed for Meta production BPF objects and BPF selftests objects. In both cases there were no differences *at all* in terms of verdict or instruction and state counts, providing a good confidence in the change. Having this low-memory-overhead solution of keeping dynamic per-instruction history cheaply opens up some new possibilities, like keeping extra information for literally every single validated instruction. This will be used for simplifying precision backpropagation logic in follow up patches. [0] https://lore.kernel.org/bpf/20241029172641.1042523-2-eddyz87@gmail.com/ [1] https://lore.kernel.org/bpf/20241029172641.1042523-1-eddyz87@gmail.com/ Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20241115001303.277272-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6b7c91629176..f4290c179bee 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -334,7 +334,7 @@ struct bpf_func_state { #define MAX_CALL_FRAMES 8 -/* instruction history flags, used in bpf_jmp_history_entry.flags field */ +/* instruction history flags, used in bpf_insn_hist_entry.flags field */ enum { /* instruction references stack slot through PTR_TO_STACK register; * we also store stack's frame number in lower 3 bits (MAX_CALL_FRAMES is 8) @@ -352,7 +352,7 @@ enum { static_assert(INSN_F_FRAMENO_MASK + 1 >= MAX_CALL_FRAMES); static_assert(INSN_F_SPI_MASK + 1 >= MAX_BPF_STACK / 8); -struct bpf_jmp_history_entry { +struct bpf_insn_hist_entry { u32 idx; /* insn idx can't be bigger than 1 million */ u32 prev_idx : 22; @@ -442,13 +442,14 @@ struct bpf_verifier_state { * See get_loop_entry() for more information. */ struct bpf_verifier_state *loop_entry; - /* jmp history recorded from first to last. - * backtracking is using it to go from last to first. - * For most states jmp_history_cnt is [0-3]. + /* Sub-range of env->insn_hist[] corresponding to this state's + * instruction history. + * Backtracking is using it to go from last to first. + * For most states instruction history is short, 0-3 instructions. * For loops can go up to ~40. */ - struct bpf_jmp_history_entry *jmp_history; - u32 jmp_history_cnt; + u32 insn_hist_start; + u32 insn_hist_end; u32 dfs_depth; u32 callback_unroll_depth; u32 may_goto_depth; @@ -738,7 +739,9 @@ struct bpf_verifier_env { int cur_stack; } cfg; struct backtrack_state bt; - struct bpf_jmp_history_entry *cur_hist_ent; + struct bpf_insn_hist_entry *insn_hist; + struct bpf_insn_hist_entry *cur_hist_ent; + u32 insn_hist_cap; u32 pass_cnt; /* number of times do_check() was called */ u32 subprog_cnt; /* number of instructions analyzed by the verifier */ -- cgit v1.2.3 From 906c508afdca3487c4273bfeda8abedc8e21047b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Fri, 15 Nov 2024 17:42:48 +0100 Subject: sysfs: attribute_group: allow registration of const bin_attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To be able to constify instances of struct bin_attribute it has to be possible to add them to string attribute_group. The current type of the bin_attrs member however is not compatible with that. Introduce a union that allows registration of both const and non-const attributes to enable a piecewise transition. As both union member types are compatible no logic needs to be adapted. Technically it is now possible register a const struct bin_attribute and receive it as mutable pointer in the callbacks. This is a soundness issue. But this same soundness issue already exists today in sysfs_create_bin_file(). Also the struct definition and callback implementation are always closely linked and are meant to be moved to const in lockstep. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20241115-b4-sysfs-const-bin_attr-group-v1-1-2c9bb12dfc48@weissschuh.net Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index d713a6445a62..0f2fcd244523 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -106,7 +106,10 @@ struct attribute_group { const struct bin_attribute *, int); struct attribute **attrs; - struct bin_attribute **bin_attrs; + union { + struct bin_attribute **bin_attrs; + const struct bin_attribute *const *bin_attrs_new; + }; }; #define SYSFS_PREALLOC 010000 -- cgit v1.2.3 From d617b3147d54c42351eac63b5398d4ddf4f4011b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 15 Nov 2024 16:54:43 +0000 Subject: io_uring: restore back registered wait arguments Now we've got a more generic region registration API, place IORING_ENTER_EXT_ARG_REG and re-enable it. First, the user has to register a region with the IORING_MEM_REGION_REG_WAIT_ARG flag set. It can only be done for a ring in a disabled state, aka IORING_SETUP_R_DISABLED, to avoid races with already running waiters. With that we should have stable constant values for ctx->cq_wait_{size,arg} in io_get_ext_arg_reg() and hence no READ_ONCE required. The other API difference is that we're now passing byte offsets instead of indexes. The user _must_ align all offsets / pointers to the native word size, failing to do so might but not necessarily has to lead to a failure usually returned as -EFAULT. liburing will be hiding this details from users. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/81822c1b4ffbe8ad391b4f9ad1564def0d26d990.1731689588.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index e1d69123e164..aa5f5ea98076 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -324,6 +324,9 @@ struct io_ring_ctx { unsigned cq_entries; struct io_ev_fd __rcu *io_ev_fd; unsigned cq_extra; + + void *cq_wait_arg; + size_t cq_wait_size; } ____cacheline_aligned_in_smp; /* -- cgit v1.2.3 From 9407f5c3ec10c155aae61fc4496a7c17a96907b4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 11 Nov 2024 16:28:51 -0800 Subject: srcu: Unconditionally record srcu_read_lock_lite() in ->srcu_reader_flavor Currently, srcu_read_lock_lite() uses the SRCU_READ_FLAVOR_LITE bit in ->srcu_reader_flavor to communicate to the grace-period processing in srcu_readers_active_idx_check() that the smp_mb() must be replaced by a synchronize_rcu(). Unfortunately, ->srcu_reader_flavor is not updated unless the kernel is built with CONFIG_PROVE_RCU=y. Therefore in all kernels built with CONFIG_PROVE_RCU=n, srcu_readers_active_idx_check() incorrectly uses smp_mb() instead of synchronize_rcu() for srcu_struct structures whose readers use srcu_read_lock_lite(). This commit therefore causes Tree SRCU srcu_read_lock_lite() to unconditionally update ->srcu_reader_flavor so that srcu_readers_active_idx_check() can make the correct choice. Reported-by: Neeraj Upadhyay Closes: https://lore.kernel.org/all/d07e8f4a-d5ff-4c8e-8e61-50db285c57e9@amd.com/ Fixes: c0f08d6b5a61 ("srcu: Add srcu_read_lock_lite() and srcu_read_unlock_lite()") Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker --- include/linux/srcu.h | 8 +------- include/linux/srcutiny.h | 3 +++ include/linux/srcutree.h | 21 +++++++++++++++++++++ 3 files changed, 25 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 56f83237de4d..08339eb8a01c 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -183,12 +183,6 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_TREE_SRCU) -void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); -#else -#define srcu_check_read_flavor(ssp, read_flavor) do { } while (0) -#endif - /** * srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing @@ -277,7 +271,7 @@ static inline int srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp) { int retval; - srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE); + srcu_check_read_flavor_lite(ssp); retval = __srcu_read_lock_lite(ssp); rcu_try_lock_acquire(&ssp->dep_map); return retval; diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 4d96bbdb45f0..1321da803274 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -81,6 +81,9 @@ static inline void srcu_barrier(struct srcu_struct *ssp) synchronize_srcu(ssp); } +#define srcu_check_read_flavor(ssp, read_flavor) do { } while (0) +#define srcu_check_read_flavor_lite(ssp) do { } while (0) + /* Defined here to avoid size increase for non-torture kernels. */ static inline void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 778eb61542e1..490aeecc6bb4 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -248,4 +248,25 @@ static inline void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_lite()."); } +void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); + +// Record _lite() usage even for CONFIG_PROVE_RCU=n kernels. +static inline void srcu_check_read_flavor_lite(struct srcu_struct *ssp) +{ + struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); + + if (likely(READ_ONCE(sdp->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE)) + return; + + // Note that the cmpxchg() in srcu_check_read_flavor() is fully ordered. + __srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE); +} + +// Record non-_lite() usage only for CONFIG_PROVE_RCU=y kernels. +static inline void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) +{ + if (IS_ENABLED(CONFIG_PROVE_RCU)) + __srcu_check_read_flavor(ssp, read_flavor); +} + #endif -- cgit v1.2.3 From 41ffcd95015f18178ddc53fed919c2842d52fe38 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 14 Nov 2024 10:33:27 +0000 Subject: net: phy: fix phylib's dual eee_enabled phylib has two eee_enabled members. Some parts of the code are using phydev->eee_enabled, other parts are using phydev->eee_cfg.eee_enabled. This leads to incorrect behaviour as their state goes out of sync. ethtool --show-eee shows incorrect information, and --set-eee sometimes doesn't take effect. Fix this by only having one eee_enabled member - that in eee_cfg. Fixes: 49168d1980e2 ("net: phy: Add phy_support_eee() indicating MAC support EEE") Signed-off-by: Russell King (Oracle) Reviewed-by: Heiner Kallweit Link: https://patch.msgid.link/E1tBXAF-00341F-EQ@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index a98bc91a0cde..44890cdf40a2 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -601,7 +601,6 @@ struct macsec_ops; * @adv_old: Saved advertised while power saving for WoL * @supported_eee: supported PHY EEE linkmodes * @advertising_eee: Currently advertised EEE linkmodes - * @eee_enabled: Flag indicating whether the EEE feature is enabled * @enable_tx_lpi: When True, MAC should transmit LPI to PHY * @eee_cfg: User configuration of EEE * @lp_advertising: Current link partner advertised linkmodes @@ -721,7 +720,6 @@ struct phy_device { /* used for eee validation and configuration*/ __ETHTOOL_DECLARE_LINK_MODE_MASK(supported_eee); __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising_eee); - bool eee_enabled; /* Host supported PHY interface types. Should be ignored if empty. */ DECLARE_PHY_INTERFACE_MASK(host_interfaces); -- cgit v1.2.3 From 221a9c1df790fa711d65daf5ba05d0addc279153 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 14 Nov 2024 03:00:11 -0800 Subject: net: netpoll: Individualize the skb pool The current implementation of the netpoll system uses a global skb pool, which can lead to inefficient memory usage and waste when targets are disabled or no longer in use. This can result in a significant amount of memory being unnecessarily allocated and retained, potentially causing performance issues and limiting the availability of resources for other system components. Modify the netpoll system to assign a skb pool to each target instead of using a global one. This approach allows for more fine-grained control over memory allocation and deallocation, ensuring that resources are only allocated and retained as needed. Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20241114-skb_buffers_v2-v3-1-9be9f52a8b69@debian.org Signed-off-by: Jakub Kicinski --- include/linux/netpoll.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index cd4e28db0cbd..77635b885c18 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -32,6 +32,7 @@ struct netpoll { bool ipv6; u16 local_port, remote_port; u8 remote_mac[ETH_ALEN]; + struct sk_buff_head skb_pool; }; struct netpoll_info { -- cgit v1.2.3 From 4b42fbc6bd8f73d9ded535d8c61ccaa837ff3bd4 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 14 Nov 2024 15:09:53 +0100 Subject: ndo_fdb_add: Add a parameter to report whether notification was sent Currently when FDB entries are added to or deleted from a VXLAN netdevice, the VXLAN driver emits one notification, including the VXLAN-specific attributes. The core however always sends a notification as well, a generic one. Thus two notifications are unnecessarily sent for these operations. A similar situation comes up with bridge driver, which also emits notifications on its own: # ip link add name vx type vxlan id 1000 dstport 4789 # bridge monitor fdb & [1] 1981693 # bridge fdb add de:ad:be:ef:13:37 dev vx self dst 192.0.2.1 de:ad:be:ef:13:37 dev vx dst 192.0.2.1 self permanent de:ad:be:ef:13:37 dev vx self permanent In order to prevent this duplicity, add a paremeter to ndo_fdb_add, bool *notified. The flag is primed to false, and if the callee sends a notification on its own, it sets it to true, thus informing the core that it should not generate another notification. Signed-off-by: Petr Machata Reviewed-by: Amit Cohen Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/cbf6ae8195e85cbf922f8058ce4eba770f3b71ed.1731589511.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0aae346d919e..6a7fd191e1ee 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1248,8 +1248,10 @@ struct netdev_net_notifier { * int (*ndo_fdb_add)(struct ndmsg *ndm, struct nlattr *tb[], * struct net_device *dev, * const unsigned char *addr, u16 vid, u16 flags, - * struct netlink_ext_ack *extack); + * bool *notified, struct netlink_ext_ack *extack); * Adds an FDB entry to dev for addr. + * Callee shall set *notified to true if it sent any appropriate + * notification(s). Otherwise core will send a generic one. * int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[], * struct net_device *dev, * const unsigned char *addr, u16 vid) @@ -1525,6 +1527,7 @@ struct net_device_ops { const unsigned char *addr, u16 vid, u16 flags, + bool *notified, struct netlink_ext_ack *extack); int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[], -- cgit v1.2.3 From 42575ad5aab932273475d1ec3e7881cb5a05420e Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 14 Nov 2024 15:09:54 +0100 Subject: ndo_fdb_del: Add a parameter to report whether notification was sent In a similar fashion to ndo_fdb_add, which was covered in the previous patch, add the bool *notified argument to ndo_fdb_del. Callees that send a notification on their own set the flag to true. Signed-off-by: Petr Machata Reviewed-by: Amit Cohen Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/06b1acf4953ef0a5ed153ef1f32d7292044f2be6.1731589511.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6a7fd191e1ee..ecc686409161 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1254,8 +1254,11 @@ struct netdev_net_notifier { * notification(s). Otherwise core will send a generic one. * int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[], * struct net_device *dev, - * const unsigned char *addr, u16 vid) + * const unsigned char *addr, u16 vid + * bool *notified, struct netlink_ext_ack *extack); * Deletes the FDB entry from dev corresponding to addr. + * Callee shall set *notified to true if it sent any appropriate + * notification(s). Otherwise core will send a generic one. * int (*ndo_fdb_del_bulk)(struct nlmsghdr *nlh, struct net_device *dev, * struct netlink_ext_ack *extack); * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb, @@ -1533,7 +1536,9 @@ struct net_device_ops { struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, - u16 vid, struct netlink_ext_ack *extack); + u16 vid, + bool *notified, + struct netlink_ext_ack *extack); int (*ndo_fdb_del_bulk)(struct nlmsghdr *nlh, struct net_device *dev, struct netlink_ext_ack *extack); -- cgit v1.2.3 From 3ef66af31feaf5ff5dd73e63b1327789822ed476 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Tue, 12 Nov 2024 09:29:20 +0800 Subject: virtio_ring: introduce add api for premapped Two APIs are introduced to submit premapped per-buffers. int virtqueue_add_inbuf_premapped(struct virtqueue *vq, struct scatterlist *sg, unsigned int num, void *data, void *ctx, gfp_t gfp); int virtqueue_add_outbuf_premapped(struct virtqueue *vq, struct scatterlist *sg, unsigned int num, void *data, gfp_t gfp); Signed-off-by: Xuan Zhuo Acked-by: Jason Wang Link: https://patch.msgid.link/20241112012928.102478-6-xuanzhuo@linux.alibaba.com Signed-off-by: Jakub Kicinski --- include/linux/virtio.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 306137a15d07..13b3f55abca3 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -56,6 +56,17 @@ int virtqueue_add_inbuf_ctx(struct virtqueue *vq, void *ctx, gfp_t gfp); +int virtqueue_add_inbuf_premapped(struct virtqueue *vq, + struct scatterlist *sg, unsigned int num, + void *data, + void *ctx, + gfp_t gfp); + +int virtqueue_add_outbuf_premapped(struct virtqueue *vq, + struct scatterlist *sg, unsigned int num, + void *data, + gfp_t gfp); + int virtqueue_add_sgs(struct virtqueue *vq, struct scatterlist *sgs[], unsigned int out_sgs, -- cgit v1.2.3 From 880ebcbe06636c42cfb328039581e177c6cd6ba5 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Tue, 12 Nov 2024 09:29:22 +0800 Subject: virtio_ring: remove API virtqueue_set_dma_premapped Now, this API is useless. remove it. Signed-off-by: Xuan Zhuo Acked-by: Jason Wang Link: https://patch.msgid.link/20241112012928.102478-8-xuanzhuo@linux.alibaba.com Signed-off-by: Jakub Kicinski --- include/linux/virtio.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 13b3f55abca3..338e0f5efb4b 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -93,8 +93,6 @@ bool virtqueue_enable_cb(struct virtqueue *vq); unsigned virtqueue_enable_cb_prepare(struct virtqueue *vq); -int virtqueue_set_dma_premapped(struct virtqueue *_vq); - bool virtqueue_poll(struct virtqueue *vq, unsigned); bool virtqueue_enable_cb_delayed(struct virtqueue *vq); -- cgit v1.2.3 From 665745f274870c921020f610e2c99a3b1613519b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 18 Oct 2024 17:47:52 +0300 Subject: PCI/bwctrl: Re-add BW notification portdrv as PCIe BW controller MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This mostly reverts the commit b4c7d2076b4e ("PCI/LINK: Remove bandwidth notification"). An upcoming commit extends this driver building PCIe bandwidth controller on top of it. PCIe bandwidth notifications were first added in the commit e8303bb7a75c ("PCI/LINK: Report degraded links via link bandwidth notification") but later had to be removed. The significant changes compared with the old bandwidth notification driver include: 1) Don't print the notifications into kernel log, just keep the Link Speed cached in struct pci_bus updated. While somewhat unfortunate, the log spam was the source of complaints that eventually lead to the removal of the bandwidth notifications driver (see the links below for further information). 2) Besides the Link Bandwidth Management Interrupt, also enable Link Autonomous Bandwidth Interrupt to cover the other source of bandwidth changes. 3) Handle Link Speed updates robustly. Refresh the cached Link Speed when enabling Bandwidth Notification Interrupts, and solve the race between Link Speed read and LBMS/LABS update in pcie_bwnotif_irq_thread(). 4) Use concurrency safe LNKCTL RMW operations. 5) The driver is now called PCIe bwctrl (bandwidth controller) instead of just bandwidth notifications because of increased scope and functionality within the driver. 6) Coexist with the Target Link Speed quirk in pcie_failed_link_retrain(). Provide LBMS counting API for it. 7) Tweaks to variable/functions names for consistency and length reasons. Bandwidth Notifications enable the cur_bus_speed in the struct pci_bus to keep track PCIe Link Speed changes. [bhelgaas: This is based on previous work by Alexandru Gagniuc ; see e8303bb7a75c ("PCI/LINK: Report degraded links via link bandwidth notification")] Link: https://lore.kernel.org/r/20241018144755.7875-7-ilpo.jarvinen@linux.intel.com Link: https://lore.kernel.org/all/20190429185611.121751-1-helgaas@kernel.org/ Link: https://lore.kernel.org/linux-pci/20190501142942.26972-1-keith.busch@intel.com/ Link: https://lore.kernel.org/linux-pci/20200115221008.GA191037@google.com/ Suggested-by: Lukas Wunner # Building bwctrl on top of bwnotif Signed-off-by: Ilpo Järvinen [bhelgaas: squash fix to drop IRQF_ONESHOT and convert to hardirq handler: https://lore.kernel.org/r/20241115165717.15233-1-ilpo.jarvinen@linux.intel.com] Signed-off-by: Bjorn Helgaas Tested-by: Stefan Wahren Reviewed-by: Jonathan Cameron --- include/linux/pci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 99c6fa30d25b..579b28833429 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -313,6 +313,7 @@ struct pci_vpd { }; struct irq_affinity; +struct pcie_bwctrl_data; struct pcie_link_state; struct pci_sriov; struct pci_p2pdma; @@ -502,6 +503,7 @@ struct pci_dev { unsigned int dpc_rp_extensions:1; u8 dpc_rp_log_size; #endif + struct pcie_bwctrl_data *link_bwctrl; #ifdef CONFIG_PCI_ATS union { struct pci_sriov *sriov; /* PF: SR-IOV info */ -- cgit v1.2.3 From de9a6c8d5dbfedb5eb3722c822da0490f6a59a45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 18 Oct 2024 17:47:53 +0300 Subject: PCI/bwctrl: Add pcie_set_target_speed() to set PCIe Link Speed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, PCIe Link Speeds are adjusted by custom code rather than in a common function provided in PCI core. The PCIe bandwidth controller (bwctrl) introduces an in-kernel API, pcie_set_target_speed(), to set PCIe Link Speed. Convert Target Speed quirk to use the new API. The Target Speed quirk runs very early when bwctrl is not yet probed for a Port and can also run later when bwctrl is already setup for the Port, which requires the per port mutex (set_speed_mutex) to be only taken if the bwctrl setup is already complete. The new API is also intended to be used in an upcoming commit that adds a thermal cooling device to throttle PCIe bandwidth when thermal thresholds are reached. The PCIe bandwidth control procedure is as follows. The highest speed supported by the Port and the PCIe device which is not higher than the requested speed is selected and written into the Target Link Speed in the Link Control 2 Register. Then bandwidth controller retrains the PCIe Link. Bandwidth Notifications enable the cur_bus_speed in the struct pci_bus to keep track PCIe Link Speed changes. While Bandwidth Notifications should also be generated when bandwidth controller alters the PCIe Link Speed, a few platforms do not deliver LMBS interrupt after Link Training as expected. Thus, after changing the Link Speed, bandwidth controller makes additional read for the Link Status Register to ensure cur_bus_speed is consistent with the new PCIe Link Speed. Link: https://lore.kernel.org/r/20241018144755.7875-8-ilpo.jarvinen@linux.intel.com Signed-off-by: Ilpo Järvinen [bhelgaas: squash devm_mutex_init() error checking from https://lore.kernel.org/r/20241030163139.2111689-1-andriy.shevchenko@linux.intel.com, drop export of pcie_set_target_speed()] Signed-off-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron --- include/linux/pci.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 579b28833429..4f02d9f91399 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1797,9 +1797,19 @@ static inline int pci_irqd_intx_xlate(struct irq_domain *d, #ifdef CONFIG_PCIEPORTBUS extern bool pcie_ports_disabled; extern bool pcie_ports_native; + +int pcie_set_target_speed(struct pci_dev *port, enum pci_bus_speed speed_req, + bool use_lt); #else #define pcie_ports_disabled true #define pcie_ports_native false + +static inline int pcie_set_target_speed(struct pci_dev *port, + enum pci_bus_speed speed_req, + bool use_lt) +{ + return -EOPNOTSUPP; +} #endif #define PCIE_LINK_STATE_L0S (BIT(0) | BIT(1)) /* Upstr/dwnstr L0s */ -- cgit v1.2.3 From d278b098282d1327f6e1be82aacb18457a4d244d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 18 Oct 2024 17:47:54 +0300 Subject: thermal: Add PCIe cooling driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a thermal cooling driver to provide path to access PCIe bandwidth controller using the usual thermal interfaces. A cooling device is instantiated for controllable PCIe Ports from the bwctrl service driver. If registering the cooling device fails, allow bwctrl's probe to succeed regardless. As cdev in that case contains IS_ERR() pseudo "pointer", clean that up inside the probe function so the remove side doesn't need to suddenly make an odd looking IS_ERR() check. The thermal side state 0 means no throttling, i.e., maximum supported PCIe Link Speed. Link: https://lore.kernel.org/r/20241018144755.7875-9-ilpo.jarvinen@linux.intel.com Signed-off-by: Ilpo Järvinen [bhelgaas: dropped data->cdev test per https://lore.kernel.org/r/ZzRm1SJTwEMRsAr8@wunner.de] Signed-off-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Acked-by: Rafael J. Wysocki # From the cooling device interface perspective --- include/linux/pci-bwctrl.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 include/linux/pci-bwctrl.h (limited to 'include/linux') diff --git a/include/linux/pci-bwctrl.h b/include/linux/pci-bwctrl.h new file mode 100644 index 000000000000..cee07127455b --- /dev/null +++ b/include/linux/pci-bwctrl.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * PCIe bandwidth controller + * + * Copyright (C) 2023-2024 Intel Corporation + */ + +#ifndef LINUX_PCI_BWCTRL_H +#define LINUX_PCI_BWCTRL_H + +#include + +struct thermal_cooling_device; + +#ifdef CONFIG_PCIE_THERMAL +struct thermal_cooling_device *pcie_cooling_device_register(struct pci_dev *port); +void pcie_cooling_device_unregister(struct thermal_cooling_device *cdev); +#else +static inline struct thermal_cooling_device *pcie_cooling_device_register(struct pci_dev *port) +{ + return NULL; +} +static inline void pcie_cooling_device_unregister(struct thermal_cooling_device *cdev) +{ +} +#endif + +#endif -- cgit v1.2.3 From d7a516c6eeae29144649f1c3f586fa580ec9e040 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Fri, 15 Nov 2024 21:46:02 +0100 Subject: compiler.h: Fix undefined BUILD_BUG_ON_ZERO() defines __must_be_array() and __must_be_cstr() and both expand to BUILD_BUG_ON_ZERO(), but defines BUILD_BUG_ON_ZERO(). Including in would create a cyclic dependency as already includes . Fix that by defining __BUILD_BUG_ON_ZERO_MSG() in and using that for __must_be_array() and __must_be_cstr(). Signed-off-by: Philipp Reisner Link: https://lore.kernel.org/r/20241115204602.249590-1-philipp.reisner@linbit.com Signed-off-by: Kees Cook --- include/linux/compiler.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 4d4e23b6e3e7..469a64dd6495 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -239,11 +239,18 @@ static inline void *offset_to_ptr(const int *off) #endif /* __ASSEMBLY__ */ +#ifdef __CHECKER__ +#define __BUILD_BUG_ON_ZERO_MSG(e, msg) (0) +#else /* __CHECKER__ */ +#define __BUILD_BUG_ON_ZERO_MSG(e, msg) ((int)sizeof(struct {_Static_assert(!(e), msg);})) +#endif /* __CHECKER__ */ + /* &a[0] degrades to a pointer: a different type from an array */ -#define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) +#define __must_be_array(a) __BUILD_BUG_ON_ZERO_MSG(__same_type((a), &(a)[0]), "must be array") /* Require C Strings (i.e. NUL-terminated) lack the "nonstring" attribute. */ -#define __must_be_cstr(p) BUILD_BUG_ON_ZERO(__annotated(p, nonstring)) +#define __must_be_cstr(p) \ + __BUILD_BUG_ON_ZERO_MSG(__annotated(p, nonstring), "must be cstr (NUL-terminated)") /* * This returns a constant expression while determining if an argument is -- cgit v1.2.3 From dab78a1745ab3c6001e1e4d50a9d09efef8e260d Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Thu, 14 Nov 2024 18:52:05 +0800 Subject: net/udp: Add 4-tuple hash list basis Add a new hash list, hash4, in udp table. It will be used to implement 4-tuple hash for connected udp sockets. This patch adds the hlist to table, and implements helpers and the initialization. 4-tuple hash is implemented in the following patch. hash4 uses hlist_nulls to avoid moving wrongly onto another hlist due to concurrent rehash, because rehash() can happen with lookup(). Co-developed-by: Cambda Zhu Signed-off-by: Cambda Zhu Co-developed-by: Fred Chen Signed-off-by: Fred Chen Co-developed-by: Yubing Qiu Signed-off-by: Yubing Qiu Signed-off-by: Philo Lu Acked-by: Willem de Bruijn Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/udp.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index 3eb3f2b9a2a0..0807e21cfec9 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -56,6 +56,12 @@ struct udp_sock { int pending; /* Any pending frames ? */ __u8 encap_type; /* Is this an Encapsulation socket? */ +#if !IS_ENABLED(CONFIG_BASE_SMALL) + /* For UDP 4-tuple hash */ + __u16 udp_lrpa_hash; + struct hlist_nulls_node udp_lrpa_node; +#endif + /* * Following member retains the information to create a UDP header * when the socket is uncorked. @@ -206,6 +212,11 @@ static inline void udp_allow_gso(struct sock *sk) #define udp_portaddr_for_each_entry_rcu(__sk, list) \ hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node) +#if !IS_ENABLED(CONFIG_BASE_SMALL) +#define udp_lrpa_for_each_entry_rcu(__up, node, list) \ + hlist_nulls_for_each_entry_rcu(__up, node, list, udp_lrpa_node) +#endif + #define IS_UDPLITE(__sk) (__sk->sk_protocol == IPPROTO_UDPLITE) #endif /* _LINUX_UDP_H */ -- cgit v1.2.3 From a06e4a93067cd8f55a74638d45146ddde76574f2 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Wed, 13 Nov 2024 09:32:15 +1100 Subject: rtc: m48t59: Use platform_data struct for year offset value Instead of hard-coded values and ifdefs, store the year offset in the platform_data struct. Tested-by: Daniel Palmer Reviewed-by: Geert Uytterhoeven Signed-off-by: Finn Thain Tested-by: Andreas Larsson Acked-by: Andreas Larsson Link: https://lore.kernel.org/r/665c3526184a8d0c4a6373297d8e7d9a12591d8b.1731450735.git.fthain@linux-m68k.org Signed-off-by: Alexandre Belloni --- include/linux/rtc/m48t59.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rtc/m48t59.h b/include/linux/rtc/m48t59.h index 9465d5405fe2..373ba77071c6 100644 --- a/include/linux/rtc/m48t59.h +++ b/include/linux/rtc/m48t59.h @@ -56,6 +56,9 @@ struct m48t59_plat_data { void __iomem *ioaddr; /* offset to RTC registers, automatically set according to the type */ unsigned int offset; + + /* YY digits (in RTC) are offset, i.e. year is 1900 + yy_offset + YY */ + int yy_offset; }; #endif /* _LINUX_RTC_M48T59_H_ */ -- cgit v1.2.3 From 03854920c39c62b88c0b540c92cf35746d059af2 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 6 Oct 2024 02:19:52 +0100 Subject: libceph: Remove unused ceph_pagelist functions ceph_pagelist_truncate() and ceph_pagelist_set_cursor() have been unused since commit 39be95e9c8c0 ("ceph: ceph_pagelist_append might sleep while atomic") Remove them. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- include/linux/ceph/pagelist.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/pagelist.h b/include/linux/ceph/pagelist.h index 5dead8486fd8..879bec0863aa 100644 --- a/include/linux/ceph/pagelist.h +++ b/include/linux/ceph/pagelist.h @@ -17,12 +17,6 @@ struct ceph_pagelist { refcount_t refcnt; }; -struct ceph_pagelist_cursor { - struct ceph_pagelist *pl; /* pagelist, for error checking */ - struct list_head *page_lru; /* page in list */ - size_t room; /* room remaining to reset to */ -}; - struct ceph_pagelist *ceph_pagelist_alloc(gfp_t gfp_flags); extern void ceph_pagelist_release(struct ceph_pagelist *pl); @@ -33,12 +27,6 @@ extern int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space); extern int ceph_pagelist_free_reserve(struct ceph_pagelist *pl); -extern void ceph_pagelist_set_cursor(struct ceph_pagelist *pl, - struct ceph_pagelist_cursor *c); - -extern int ceph_pagelist_truncate(struct ceph_pagelist *pl, - struct ceph_pagelist_cursor *c); - static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v) { __le64 ev = cpu_to_le64(v); -- cgit v1.2.3 From ee1eb8ccaab8cc2ef4bda8e11a40409ee20f6405 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 6 Oct 2024 02:19:53 +0100 Subject: libceph: Remove unused pagevec functions ceph_copy_user_to_page_vector() has been unused since 2013's commit e8344e668915 ("ceph: Implement writev/pwritev for sync operation.") ceph_copy_to_page_vector() has been unused since 2012's commit 913d2fdcf605 ("rbd: always pass ops array to rbd_req_sync_op()") Remove them. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- include/linux/ceph/libceph.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 15fb566d3f46..733e7f93db66 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -317,12 +317,6 @@ extern void ceph_release_page_vector(struct page **pages, int num_pages); extern void ceph_put_page_vector(struct page **pages, int num_pages, bool dirty); extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); -extern int ceph_copy_user_to_page_vector(struct page **pages, - const void __user *data, - loff_t off, size_t len); -extern void ceph_copy_to_page_vector(struct page **pages, - const void *data, - loff_t off, size_t len); extern void ceph_copy_from_page_vector(struct page **pages, void *data, loff_t off, size_t len); -- cgit v1.2.3 From 32844fd72b879d02f1f6b4394025349d31a09fd3 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 6 Oct 2024 02:19:54 +0100 Subject: libceph: Remove unused ceph_osdc_watch_check ceph_osdc_watch_check() has been unused since it was added in commit b07d3c4bd727 ("libceph: support for checking on status of watch") Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index d7941478158c..d55b30057a45 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -626,8 +626,6 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc, u32 timeout, struct page ***preply_pages, size_t *preply_len); -int ceph_osdc_watch_check(struct ceph_osd_client *osdc, - struct ceph_osd_linger_request *lreq); int ceph_osdc_list_watchers(struct ceph_osd_client *osdc, struct ceph_object_id *oid, struct ceph_object_locator *oloc, -- cgit v1.2.3 From 979c6342f9c0a48696a6420f14f9dd409591657f Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 15 Nov 2024 13:41:21 -0800 Subject: nvme-pci: add support for sgl metadata Supporting this mode allows creating and merging multi-segment metadata requests that wouldn't be possible otherwise. It also allows directly using user space requests that straddle physically discontiguous pages. Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 0a6e22038ce3..5873ce859cc8 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -389,6 +389,7 @@ enum { NVME_CTRL_CTRATT_PREDICTABLE_LAT = 1 << 5, NVME_CTRL_CTRATT_NAMESPACE_GRANULARITY = 1 << 7, NVME_CTRL_CTRATT_UUID_LIST = 1 << 9, + NVME_CTRL_SGLS_MSDS = 1 << 19, }; struct nvme_lbaf { -- cgit v1.2.3 From 6399a0db8cd61eedbfb4b7809a4f4699157a9bf8 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 13 Nov 2024 13:57:04 -0800 Subject: nvme: define the remaining used sgls constants This provides a little more context when reading the code than hardcoded magic numbers. Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 5873ce859cc8..2baf9a80b470 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -389,7 +389,11 @@ enum { NVME_CTRL_CTRATT_PREDICTABLE_LAT = 1 << 5, NVME_CTRL_CTRATT_NAMESPACE_GRANULARITY = 1 << 7, NVME_CTRL_CTRATT_UUID_LIST = 1 << 9, + NVME_CTRL_SGLS_BYTE_ALIGNED = 1, + NVME_CTRL_SGLS_DWORD_ALIGNED = 2, + NVME_CTRL_SGLS_KSDBDS = 1 << 2, NVME_CTRL_SGLS_MSDS = 1 << 19, + NVME_CTRL_SGLS_SAOS = 1 << 20, }; struct nvme_lbaf { -- cgit v1.2.3 From 28b6acd75e3cefbe746ec7402c7ff4fdb114f327 Mon Sep 17 00:00:00 2001 From: Rick Wertenbroek Date: Thu, 14 Nov 2024 17:10:32 +0100 Subject: PCI: endpoint: Fix pci_epc_map map_size kerneldoc string MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because some endpoint controllers have requirements on the alignment of the controller physical memory address that must be used to map a RC PCI address region, the map PCI start address is not necessarily the desired PCI base address to be mapped. This can result in map_pci_addr being lower than pci_addr as documented. This results in map_size covering the range map_pci_addr..pci_addr+pci_size. The old text had the pci_addr twice instead of map_pci_addr..pci_addr, so replace the erroneous kerneldoc string to reflect the actual range. Link: https://lore.kernel.org/r/20241114161032.3046202-1-rick.wertenbroek@gmail.com Signed-off-by: Rick Wertenbroek Signed-off-by: Krzysztof Wilczyński --- include/linux/pci-epc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index de8cc3658220..e818e3fdcded 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -40,7 +40,7 @@ pci_epc_interface_string(enum pci_epc_interface_type type) * @map_pci_addr: RC PCI address used as the first address mapped (may be lower * than @pci_addr) * @map_size: size of the controller memory needed for mapping the RC PCI address - * range @pci_addr..@pci_addr+@pci_size + * range @map_pci_addr..@pci_addr+@pci_size * @phys_base: base physical address of the allocated EPC memory for mapping the * RC PCI address range * @phys_addr: physical address at which @pci_addr is mapped -- cgit v1.2.3 From 2f746e40e9baae878f8193e09d8f6d601dce42bb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2024 09:36:27 -0400 Subject: lockd: Remove unused typedef Clean up: Looks like the last usage of this typedef was removed by commit 026fec7e7c47 ("sunrpc: properly type pc_decode callbacks") in 2017. Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/lockd/xdr.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h index 80cca9426761..17d53165d9f2 100644 --- a/include/linux/lockd/xdr.h +++ b/include/linux/lockd/xdr.h @@ -73,8 +73,6 @@ struct nlm_args { u32 fsm_mode; }; -typedef struct nlm_args nlm_args; - /* * Generic lockd result */ -- cgit v1.2.3 From 8994a512e2598c0bf1b6e9ece1e8292976b0dd65 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 17 Oct 2024 09:36:30 -0400 Subject: lockd: Remove unused parameter to nlmsvc_testlock() The nlm_cookie parameter has been unused since commit 09802fd2a8ca ("lockd: rip out deferred lock handling from testlock codepath"). Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/lockd/lockd.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 61c4b9c41904..c8f0f9458f2c 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -278,9 +278,9 @@ __be32 nlmsvc_lock(struct svc_rqst *, struct nlm_file *, struct nlm_host *, struct nlm_lock *, int, struct nlm_cookie *, int); __be32 nlmsvc_unlock(struct net *net, struct nlm_file *, struct nlm_lock *); -__be32 nlmsvc_testlock(struct svc_rqst *, struct nlm_file *, - struct nlm_host *, struct nlm_lock *, - struct nlm_lock *, struct nlm_cookie *); +__be32 nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, + struct nlm_host *host, struct nlm_lock *lock, + struct nlm_lock *conflock); __be32 nlmsvc_cancel_blocked(struct net *net, struct nlm_file *, struct nlm_lock *); void nlmsvc_retry_blocked(struct svc_rqst *rqstp); void nlmsvc_traverse_blocks(struct nlm_host *, struct nlm_file *, -- cgit v1.2.3 From c840b8e1f039e90f97ca55525667eb961422f86c Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 13 Nov 2024 22:59:38 -0500 Subject: nfs_common: must not hold RCU while calling nfsd_file_put_local Move holding the RCU from nfs_to_nfsd_file_put_local to nfs_to_nfsd_net_put. It is the call to nfs_to->nfsd_serv_put that requires the RCU anyway (the puts for nfsd_file and netns were combined to avoid an extra indirect reference but that micro-optimization isn't possible now). This fixes xfstests generic/013 and it triggering: "Voluntary context switch within RCU read-side critical section!" [ 143.545738] Call Trace: [ 143.546206] [ 143.546625] ? show_regs+0x6d/0x80 [ 143.547267] ? __warn+0x91/0x140 [ 143.547951] ? rcu_note_context_switch+0x496/0x5d0 [ 143.548856] ? report_bug+0x193/0x1a0 [ 143.549557] ? handle_bug+0x63/0xa0 [ 143.550214] ? exc_invalid_op+0x1d/0x80 [ 143.550938] ? asm_exc_invalid_op+0x1f/0x30 [ 143.551736] ? rcu_note_context_switch+0x496/0x5d0 [ 143.552634] ? wakeup_preempt+0x62/0x70 [ 143.553358] __schedule+0xaa/0x1380 [ 143.554025] ? _raw_spin_unlock_irqrestore+0x12/0x40 [ 143.554958] ? try_to_wake_up+0x1fe/0x6b0 [ 143.555715] ? wake_up_process+0x19/0x20 [ 143.556452] schedule+0x2e/0x120 [ 143.557066] schedule_preempt_disabled+0x19/0x30 [ 143.557933] rwsem_down_read_slowpath+0x24d/0x4a0 [ 143.558818] ? xfs_efi_item_format+0x50/0xc0 [xfs] [ 143.559894] down_read+0x4e/0xb0 [ 143.560519] xlog_cil_commit+0x1b2/0xbc0 [xfs] [ 143.561460] ? _raw_spin_unlock+0x12/0x30 [ 143.562212] ? xfs_inode_item_precommit+0xc7/0x220 [xfs] [ 143.563309] ? xfs_trans_run_precommits+0x69/0xd0 [xfs] [ 143.564394] __xfs_trans_commit+0xb5/0x330 [xfs] [ 143.565367] xfs_trans_roll+0x48/0xc0 [xfs] [ 143.566262] xfs_defer_trans_roll+0x57/0x100 [xfs] [ 143.567278] xfs_defer_finish_noroll+0x27a/0x490 [xfs] [ 143.568342] xfs_defer_finish+0x1a/0x80 [xfs] [ 143.569267] xfs_bunmapi_range+0x4d/0xb0 [xfs] [ 143.570208] xfs_itruncate_extents_flags+0x13d/0x230 [xfs] [ 143.571353] xfs_free_eofblocks+0x12e/0x190 [xfs] [ 143.572359] xfs_file_release+0x12d/0x140 [xfs] [ 143.573324] __fput+0xe8/0x2d0 [ 143.573922] __fput_sync+0x1d/0x30 [ 143.574574] nfsd_filp_close+0x33/0x60 [nfsd] [ 143.575430] nfsd_file_free+0x96/0x150 [nfsd] [ 143.576274] nfsd_file_put+0xf7/0x1a0 [nfsd] [ 143.577104] nfsd_file_put_local+0x18/0x30 [nfsd] [ 143.578070] nfs_close_local_fh+0x101/0x110 [nfs_localio] [ 143.579079] __put_nfs_open_context+0xc9/0x180 [nfs] [ 143.580031] nfs_file_clear_open_context+0x4a/0x60 [nfs] [ 143.581038] nfs_file_release+0x3e/0x60 [nfs] [ 143.581879] __fput+0xe8/0x2d0 [ 143.582464] __fput_sync+0x1d/0x30 [ 143.583108] __x64_sys_close+0x41/0x80 [ 143.583823] x64_sys_call+0x189a/0x20d0 [ 143.584552] do_syscall_64+0x64/0x170 [ 143.585240] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 143.586185] RIP: 0033:0x7f3c5153efd7 Fixes: 65f2a5c36635 ("nfs_common: fix race in NFS calls to nfsd_file_put_local() and nfsd_serv_put()") Signed-off-by: Mike Snitzer Reviewed-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/nfslocalio.h | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index 3982fea79919..9202f4b24343 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -55,7 +55,7 @@ struct nfsd_localio_operations { const struct cred *, const struct nfs_fh *, const fmode_t); - void (*nfsd_file_put_local)(struct nfsd_file *); + struct net *(*nfsd_file_put_local)(struct nfsd_file *); struct file *(*nfsd_file_file)(struct nfsd_file *); } ____cacheline_aligned; @@ -66,7 +66,7 @@ struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *, struct rpc_clnt *, const struct cred *, const struct nfs_fh *, const fmode_t); -static inline void nfs_to_nfsd_file_put_local(struct nfsd_file *localio) +static inline void nfs_to_nfsd_net_put(struct net *net) { /* * Once reference to nfsd_serv is dropped, NFSD could be @@ -74,10 +74,22 @@ static inline void nfs_to_nfsd_file_put_local(struct nfsd_file *localio) * by always taking RCU. */ rcu_read_lock(); - nfs_to->nfsd_file_put_local(localio); + nfs_to->nfsd_serv_put(net); rcu_read_unlock(); } +static inline void nfs_to_nfsd_file_put_local(struct nfsd_file *localio) +{ + /* + * Must not hold RCU otherwise nfsd_file_put() can easily trigger: + * "Voluntary context switch within RCU read-side critical section!" + * by scheduling deep in underlying filesystem (e.g. XFS). + */ + struct net *net = nfs_to->nfsd_file_put_local(localio); + + nfs_to_nfsd_net_put(net); +} + #else /* CONFIG_NFS_LOCALIO */ static inline void nfsd_localio_ops_init(void) { -- cgit v1.2.3 From 1cfb5e57886aa69a992ff0ebd32e4651eb0fc995 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 15 Nov 2024 12:43:03 -0800 Subject: Revert "net: ethtool: Avoid thousands of -Wflex-array-member-not-at-end warnings" This reverts commit 3bd9b9abdf1563a22041b7255baea6d449902f1a. We cannot use the new tagged struct group because it throws C++ errors even under "extern C". Signed-off-by: Kees Cook Link: https://patch.msgid.link/20241115204308.3821419-1-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 299280c94d07..b8b935b52603 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -211,7 +211,7 @@ void ethtool_rxfh_context_lost(struct net_device *dev, u32 context_id); * fields, but they are allowed to overwrite them (will be ignored). */ struct ethtool_link_ksettings { - struct ethtool_link_settings_hdr base; + struct ethtool_link_settings base; struct { __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising); -- cgit v1.2.3 From a57d5a72f8dec7db8a79d0016fb0a3bdecc82b56 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 18 Nov 2024 03:15:18 -0800 Subject: netpoll: Use rcu_access_pointer() in netpoll_poll_lock The ndev->npinfo pointer in netpoll_poll_lock() is RCU-protected but is being accessed directly for a NULL check. While no RCU read lock is held in this context, we should still use proper RCU primitives for consistency and correctness. Replace the direct NULL check with rcu_access_pointer(), which is the appropriate primitive when only checking for NULL without dereferencing the pointer. This function provides the necessary ordering guarantees without requiring RCU read-side protection. Fixes: bea3348eef27 ("[NET]: Make NAPI polling independent of struct net_device objects.") Signed-off-by: Breno Leitao Reviewed-by: Michal Kubiak Link: https://patch.msgid.link/20241118-netpoll_rcu-v1-2-a1888dcb4a02@debian.org Signed-off-by: Jakub Kicinski --- include/linux/netpoll.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index cd4e28db0cbd..959a4daacea1 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -72,7 +72,7 @@ static inline void *netpoll_poll_lock(struct napi_struct *napi) { struct net_device *dev = napi->dev; - if (dev && dev->npinfo) { + if (dev && rcu_access_pointer(dev->npinfo)) { int owner = smp_processor_id(); while (cmpxchg(&napi->poll_owner, -1, owner) != -1) -- cgit v1.2.3 From 50f42c489528566ea2d8a9561ee54748b0441d51 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Wed, 26 Jun 2024 09:49:10 -0400 Subject: ceph: correct ceph_mds_cap_item field name The issue_seq is sent with bulk cap releases, not the current sequence number. See also ceph.git commit 655cddb7c9f3 ("include/ceph_fs: correct ceph_mds_cap_item field name"). Link: https://tracker.ceph.com/issues/66704 Signed-off-by: Patrick Donnelly Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- include/linux/ceph/ceph_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index ee1d0e5f9789..4ff3ad5e9210 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -822,7 +822,7 @@ struct ceph_mds_cap_release { struct ceph_mds_cap_item { __le64 ino; __le64 cap_id; - __le32 migrate_seq, seq; + __le32 migrate_seq, issue_seq; } __attribute__ ((packed)); #define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */ -- cgit v1.2.3 From 8b41ac43c7bb902786eae09cc23bcc9964ef2386 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Wed, 26 Jun 2024 12:57:27 -0400 Subject: ceph: correct ceph_mds_cap_peer field name The peer seq is used as the issue_seq. Use that name for consistency. See also ceph.git commit 1da6ef237fc7 ("include/ceph_fs: correct ceph_mds_cap_peer field name"). Link: https://tracker.ceph.com/issues/66704 Signed-off-by: Patrick Donnelly Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- include/linux/ceph/ceph_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 4ff3ad5e9210..2d7d86f0290d 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -808,7 +808,7 @@ struct ceph_mds_caps { struct ceph_mds_cap_peer { __le64 cap_id; - __le32 seq; + __le32 issue_seq; __le32 mseq; __le32 mds; __u8 flags; -- cgit v1.2.3 From 46fd48ab3ea3eb3bb215684bd66ea3d260b091a9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2024 08:26:02 +0100 Subject: block: return unsigned int from bdev_io_min The underlying limit is defined as an unsigned int, so return that from bdev_io_min as well. Fixes: ac481c20ef8f ("block: Topology ioctls") Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: John Garry Link: https://lore.kernel.org/r/20241119072602.1059488-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a1fd0ddce5cf..195db38fda16 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1261,7 +1261,7 @@ static inline unsigned int queue_io_min(const struct request_queue *q) return q->limits.io_min; } -static inline int bdev_io_min(struct block_device *bdev) +static inline unsigned int bdev_io_min(struct block_device *bdev) { return queue_io_min(bdev_get_queue(bdev)); } -- cgit v1.2.3 From f06e108a3dc53c0f5234d18de0bd224753db5019 Mon Sep 17 00:00:00 2001 From: Jan Hendrik Farr Date: Tue, 29 Oct 2024 15:00:36 +0100 Subject: Compiler Attributes: disable __counted_by for clang < 19.1.3 This patch disables __counted_by for clang versions < 19.1.3 because of the two issues listed below. It does this by introducing CONFIG_CC_HAS_COUNTED_BY. 1. clang < 19.1.2 has a bug that can lead to __bdos returning 0: https://github.com/llvm/llvm-project/pull/110497 2. clang < 19.1.3 has a bug that can lead to __bdos being off by 4: https://github.com/llvm/llvm-project/pull/112636 Fixes: c8248faf3ca2 ("Compiler Attributes: counted_by: Adjust name and identifier expansion") Cc: stable@vger.kernel.org # 6.6.x: 16c31dd7fdf6: Compiler Attributes: counted_by: bump min gcc version Cc: stable@vger.kernel.org # 6.6.x: 2993eb7a8d34: Compiler Attributes: counted_by: fixup clang URL Cc: stable@vger.kernel.org # 6.6.x: 231dc3f0c936: lkdtm/bugs: Improve warning message for compilers without counted_by support Cc: stable@vger.kernel.org # 6.6.x Reported-by: Nathan Chancellor Closes: https://lore.kernel.org/all/20240913164630.GA4091534@thelio-3990X/ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202409260949.a1254989-oliver.sang@intel.com Link: https://lore.kernel.org/all/Zw8iawAF5W2uzGuh@archlinux/T/#m204c09f63c076586a02d194b87dffc7e81b8de7b Suggested-by: Nathan Chancellor Signed-off-by: Jan Hendrik Farr Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Reviewed-by: Miguel Ojeda Reviewed-by: Thorsten Blum Link: https://lore.kernel.org/r/20241029140036.577804-2-kernel@jfarr.cc Signed-off-by: Kees Cook --- include/linux/compiler_attributes.h | 13 ------------- include/linux/compiler_types.h | 19 +++++++++++++++++++ 2 files changed, 19 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index 32284cd26d52..c16d4199bf92 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -94,19 +94,6 @@ # define __copy(symbol) #endif -/* - * Optional: only supported since gcc >= 15 - * Optional: only supported since clang >= 18 - * - * gcc: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108896 - * clang: https://github.com/llvm/llvm-project/pull/76348 - */ -#if __has_attribute(__counted_by__) -# define __counted_by(member) __attribute__((__counted_by__(member))) -#else -# define __counted_by(member) -#endif - /* * Optional: not supported by gcc * Optional: only supported since clang >= 14.0 diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 1a957ea2f4fe..639be0f30b45 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -323,6 +323,25 @@ struct ftrace_likely_data { #define __no_sanitize_or_inline __always_inline #endif +/* + * Optional: only supported since gcc >= 15 + * Optional: only supported since clang >= 18 + * + * gcc: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108896 + * clang: https://github.com/llvm/llvm-project/pull/76348 + * + * __bdos on clang < 19.1.2 can erroneously return 0: + * https://github.com/llvm/llvm-project/pull/110497 + * + * __bdos on clang < 19.1.3 can be off by 4: + * https://github.com/llvm/llvm-project/pull/112636 + */ +#ifdef CONFIG_CC_HAS_COUNTED_BY +# define __counted_by(member) __attribute__((__counted_by__(member))) +#else +# define __counted_by(member) +#endif + /* * Apply __counted_by() when the Endianness matches to increase test coverage. */ -- cgit v1.2.3 From d7f36dc446e894e0f57b5f05c5628f03c5f9e2d2 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 18 Nov 2024 10:50:15 +0000 Subject: block: Support atomic writes limits for stacked devices Allow stacked devices to support atomic writes by aggregating the minimum capability of all bottom devices. Flag BLK_FEAT_ATOMIC_WRITES_STACKED is set for stacked devices which have been enabled to support atomic writes. Some things to note on the implementation: - For simplicity, all bottom devices must have same atomic write boundary value (if any) - The atomic write boundary must be a power-of-2 already, but this restriction could be relaxed. Furthermore, it is now required that the chunk sectors for a top device must be aligned with this boundary. - If a bottom device atomic write unit min/max are not aligned with the top device chunk sectors, the top device atomic write unit min/max are reduced to a value which works for the chunk sectors. Reviewed-by: Christoph Hellwig Signed-off-by: John Garry Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20241118105018.1870052-3-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 195db38fda16..31867de88213 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -333,6 +333,10 @@ typedef unsigned int __bitwise blk_features_t; #define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \ ((__force blk_features_t)(1u << 15)) +/* stacked device can/does support atomic writes */ +#define BLK_FEAT_ATOMIC_WRITES_STACKED \ + ((__force blk_features_t)(1u << 16)) + /* * Flags automatically inherited when stacking limits. */ -- cgit v1.2.3 From 9cf9f2e70bea4e66a2c8b8c4743489beb21258a8 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 15 Nov 2024 22:00:25 +0100 Subject: cpuidle: Change :enter_dead() driver callback return type to void After a previous change, cpuidle_play_dead(), which is the only caller of idle state :enter_dead() callbacks, ignores their return values, so they may as well be void. Suggested-by: Peter Zijlstra Signed-off-by: Rafael J. Wysocki Reviewed-by: Gautham R. Shenoy Reviewed-by: Mario Limonciello Link: https://patch.msgid.link/2285569.iZASKD2KPV@rjwysocki.net --- include/linux/cpuidle.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 3183aeb7f5b4..a9ee4fe55dcf 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -61,7 +61,7 @@ struct cpuidle_state { struct cpuidle_driver *drv, int index); - int (*enter_dead) (struct cpuidle_device *dev, int index); + void (*enter_dead) (struct cpuidle_device *dev, int index); /* * CPUs execute ->enter_s2idle with the local tick or entire timekeeping -- cgit v1.2.3 From 5a9d1b83e5334915c651604648c20a9fc64d47a3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2024 17:09:18 +0100 Subject: block: return unsigned int from bdev_io_opt The underlying limit is defined as an unsigned int, so return that from bdev_io_opt as well. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241119160932.1327864-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 31867de88213..97c89c0c1398 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1275,7 +1275,7 @@ static inline unsigned int queue_io_opt(const struct request_queue *q) return q->limits.io_opt; } -static inline int bdev_io_opt(struct block_device *bdev) +static inline unsigned int bdev_io_opt(struct block_device *bdev) { return queue_io_opt(bdev_get_queue(bdev)); } -- cgit v1.2.3 From ed5db174cf39374215934f21b04639a7a1513023 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2024 17:09:19 +0100 Subject: block: return unsigned int from queue_dma_alignment The underlying limit is defined as an unsigned int, so return that from queue_dma_alignment as well. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241119160932.1327864-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 97c89c0c1398..d8de261c2b99 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1421,7 +1421,7 @@ static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector) return is_seq; } -static inline int queue_dma_alignment(const struct request_queue *q) +static inline unsigned int queue_dma_alignment(const struct request_queue *q) { return q->limits.dma_alignment; } -- cgit v1.2.3 From e769489a54401d0c89555f7ad8672038b5c2b767 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2024 17:09:20 +0100 Subject: block: return unsigned int from blk_lim_dma_alignment_and_pad The underlying limits are defined as unsigned int, so return that from blk_lim_dma_alignment_and_pad as well. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241119160932.1327864-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d8de261c2b99..d0d8190429c8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1462,7 +1462,8 @@ static inline bool bdev_iter_is_aligned(struct block_device *bdev, bdev_logical_block_size(bdev) - 1); } -static inline int blk_lim_dma_alignment_and_pad(struct queue_limits *lim) +static inline unsigned int +blk_lim_dma_alignment_and_pad(struct queue_limits *lim) { return lim->dma_alignment | lim->dma_pad_mask; } -- cgit v1.2.3 From da77d9b23700708d0d22a4407d32a8755a3596e8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2024 17:09:21 +0100 Subject: block: return bool from blk_rq_aligned blk_rq_aligned returns a boolean condition, don't mascquerade it as int. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241119160932.1327864-5-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d0d8190429c8..5270117d54ac 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1468,7 +1468,7 @@ blk_lim_dma_alignment_and_pad(struct queue_limits *lim) return lim->dma_alignment | lim->dma_pad_mask; } -static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr, +static inline bool blk_rq_aligned(struct request_queue *q, unsigned long addr, unsigned int len) { unsigned int alignment = blk_lim_dma_alignment_and_pad(&q->limits); -- cgit v1.2.3 From e888810bc4f471f85989a0991aff28d2ac9f783b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2024 17:09:22 +0100 Subject: block: remove a duplicate definition for bdev_read_only bdev_read_only is already defined as an inline function in blkdev.h. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241119160932.1327864-6-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5270117d54ac..8b4e4692e7fb 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1586,7 +1586,6 @@ static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time) return bio_end_io_acct_remapped(bio, start_time, bio->bi_bdev); } -int bdev_read_only(struct block_device *bdev); int set_blocksize(struct file *file, int size); int lookup_bdev(const char *pathname, dev_t *dev); -- cgit v1.2.3 From 766a71ef65bb217ed8bf1c068ac14c7d3c15d487 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2024 17:09:23 +0100 Subject: block: return bool from get_disk_ro and bdev_read_only get_disk_ro and bdev_read_only return boolean conditions, don't masquerade them as int. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241119160932.1327864-7-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8b4e4692e7fb..08a727b40816 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -779,13 +779,13 @@ static inline void bdev_clear_flag(struct block_device *bdev, unsigned flag) atomic_andnot(flag, &bdev->__bd_flags); } -static inline int get_disk_ro(struct gendisk *disk) +static inline bool get_disk_ro(struct gendisk *disk) { return bdev_test_flag(disk->part0, BD_READ_ONLY) || test_bit(GD_READ_ONLY, &disk->state); } -static inline int bdev_read_only(struct block_device *bdev) +static inline bool bdev_read_only(struct block_device *bdev) { return bdev_test_flag(bdev, BD_READ_ONLY) || get_disk_ro(bdev->bd_disk); } -- cgit v1.2.3 From f46b9cdb22f7a167c36b6bcddaef7e8aee2598fa Mon Sep 17 00:00:00 2001 From: David Wei Date: Wed, 20 Nov 2024 14:14:52 -0800 Subject: io_uring: limit local tw done Instead of eagerly running all available local tw, limit the amount of local tw done to the max of IO_LOCAL_TW_DEFAULT_MAX (20) or wait_nr. The value of 20 is chosen as a reasonable heuristic to allow enough work batching but also keep latency down. Add a retry_llist that maintains a list of local tw that couldn't be done in time. No synchronisation is needed since it is only modified within the task context. Signed-off-by: David Wei Link: https://lore.kernel.org/r/20241120221452.3762588-3-dw@davidwei.uk Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index aa5f5ea98076..3e934feb3187 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -335,6 +335,7 @@ struct io_ring_ctx { */ struct { struct llist_head work_llist; + struct llist_head retry_llist; unsigned long check_cq; atomic_t cq_wait_nr; atomic_t cq_timeouts; -- cgit v1.2.3 From b88cbaaa6fa109c2eb455d8fe2a318de0d197ea2 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 15 Nov 2024 15:44:27 -0600 Subject: PCI/pwrctrl: Rename pwrctl files to pwrctrl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To slightly reduce confusion between "pwrctl" (the power controller and power sequencing framework) and "bwctrl" (the bandwidth controller), rename "pwrctl" to "pwrctrl" so they use the same "ctrl" suffix. Rename drivers/pci/pwrctl/ to drivers/pci/pwrctrl/, including the related MAINTAINERS, include file (include/linux/pci-pwrctl.h), Makefile, and Kconfig changes. This is the minimal rename of files only. A subsequent commit will rename functions and data structures. Link: https://lore.kernel.org/r/20241115214428.2061153-2-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Signed-off-by: Krzysztof Wilczyński Acked-by: Krzysztof Wilczyński --- include/linux/pci-pwrctl.h | 54 --------------------------------------------- include/linux/pci-pwrctrl.h | 54 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 54 deletions(-) delete mode 100644 include/linux/pci-pwrctl.h create mode 100644 include/linux/pci-pwrctrl.h (limited to 'include/linux') diff --git a/include/linux/pci-pwrctl.h b/include/linux/pci-pwrctl.h deleted file mode 100644 index 0d23dddf59ec..000000000000 --- a/include/linux/pci-pwrctl.h +++ /dev/null @@ -1,54 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2024 Linaro Ltd. - */ - -#ifndef __PCI_PWRCTL_H__ -#define __PCI_PWRCTL_H__ - -#include -#include - -struct device; -struct device_link; - -/* - * This is a simple framework for solving the issue of PCI devices that require - * certain resources (regulators, GPIOs, clocks) to be enabled before the - * device can actually be detected on the PCI bus. - * - * The idea is to reuse the platform bus to populate OF nodes describing the - * PCI device and its resources, let these platform devices probe and enable - * relevant resources and then trigger a rescan of the PCI bus allowing for the - * same device (with a second associated struct device) to be registered with - * the PCI subsystem. - * - * To preserve a correct hierarchy for PCI power management and device reset, - * we create a device link between the power control platform device (parent) - * and the supplied PCI device (child). - */ - -/** - * struct pci_pwrctl - PCI device power control context. - * @dev: Address of the power controlling device. - * - * An object of this type must be allocated by the PCI power control device and - * passed to the pwrctl subsystem to trigger a bus rescan and setup a device - * link with the device once it's up. - */ -struct pci_pwrctl { - struct device *dev; - - /* Private: don't use. */ - struct notifier_block nb; - struct device_link *link; - struct work_struct work; -}; - -void pci_pwrctl_init(struct pci_pwrctl *pwrctl, struct device *dev); -int pci_pwrctl_device_set_ready(struct pci_pwrctl *pwrctl); -void pci_pwrctl_device_unset_ready(struct pci_pwrctl *pwrctl); -int devm_pci_pwrctl_device_set_ready(struct device *dev, - struct pci_pwrctl *pwrctl); - -#endif /* __PCI_PWRCTL_H__ */ diff --git a/include/linux/pci-pwrctrl.h b/include/linux/pci-pwrctrl.h new file mode 100644 index 000000000000..0d23dddf59ec --- /dev/null +++ b/include/linux/pci-pwrctrl.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2024 Linaro Ltd. + */ + +#ifndef __PCI_PWRCTL_H__ +#define __PCI_PWRCTL_H__ + +#include +#include + +struct device; +struct device_link; + +/* + * This is a simple framework for solving the issue of PCI devices that require + * certain resources (regulators, GPIOs, clocks) to be enabled before the + * device can actually be detected on the PCI bus. + * + * The idea is to reuse the platform bus to populate OF nodes describing the + * PCI device and its resources, let these platform devices probe and enable + * relevant resources and then trigger a rescan of the PCI bus allowing for the + * same device (with a second associated struct device) to be registered with + * the PCI subsystem. + * + * To preserve a correct hierarchy for PCI power management and device reset, + * we create a device link between the power control platform device (parent) + * and the supplied PCI device (child). + */ + +/** + * struct pci_pwrctl - PCI device power control context. + * @dev: Address of the power controlling device. + * + * An object of this type must be allocated by the PCI power control device and + * passed to the pwrctl subsystem to trigger a bus rescan and setup a device + * link with the device once it's up. + */ +struct pci_pwrctl { + struct device *dev; + + /* Private: don't use. */ + struct notifier_block nb; + struct device_link *link; + struct work_struct work; +}; + +void pci_pwrctl_init(struct pci_pwrctl *pwrctl, struct device *dev); +int pci_pwrctl_device_set_ready(struct pci_pwrctl *pwrctl); +void pci_pwrctl_device_unset_ready(struct pci_pwrctl *pwrctl); +int devm_pci_pwrctl_device_set_ready(struct device *dev, + struct pci_pwrctl *pwrctl); + +#endif /* __PCI_PWRCTL_H__ */ -- cgit v1.2.3 From 3f925cd6287401bbc9d568f56d796a69c8bd292a Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 15 Nov 2024 15:44:28 -0600 Subject: PCI/pwrctrl: Rename pwrctrl functions and structures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename pwrctrl functions and structures from "pwrctl" to "pwrctrl" to match the similar file renames. Link: https://lore.kernel.org/r/20241115214428.2061153-3-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Signed-off-by: Krzysztof Wilczyński Acked-by: Krzysztof Wilczyński --- include/linux/pci-pwrctrl.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-pwrctrl.h b/include/linux/pci-pwrctrl.h index 0d23dddf59ec..7d439b0675e9 100644 --- a/include/linux/pci-pwrctrl.h +++ b/include/linux/pci-pwrctrl.h @@ -3,8 +3,8 @@ * Copyright (C) 2024 Linaro Ltd. */ -#ifndef __PCI_PWRCTL_H__ -#define __PCI_PWRCTL_H__ +#ifndef __PCI_PWRCTRL_H__ +#define __PCI_PWRCTRL_H__ #include #include @@ -29,14 +29,14 @@ struct device_link; */ /** - * struct pci_pwrctl - PCI device power control context. + * struct pci_pwrctrl - PCI device power control context. * @dev: Address of the power controlling device. * * An object of this type must be allocated by the PCI power control device and - * passed to the pwrctl subsystem to trigger a bus rescan and setup a device + * passed to the pwrctrl subsystem to trigger a bus rescan and setup a device * link with the device once it's up. */ -struct pci_pwrctl { +struct pci_pwrctrl { struct device *dev; /* Private: don't use. */ @@ -45,10 +45,10 @@ struct pci_pwrctl { struct work_struct work; }; -void pci_pwrctl_init(struct pci_pwrctl *pwrctl, struct device *dev); -int pci_pwrctl_device_set_ready(struct pci_pwrctl *pwrctl); -void pci_pwrctl_device_unset_ready(struct pci_pwrctl *pwrctl); -int devm_pci_pwrctl_device_set_ready(struct device *dev, - struct pci_pwrctl *pwrctl); +void pci_pwrctrl_init(struct pci_pwrctrl *pwrctrl, struct device *dev); +int pci_pwrctrl_device_set_ready(struct pci_pwrctrl *pwrctrl); +void pci_pwrctrl_device_unset_ready(struct pci_pwrctrl *pwrctrl); +int devm_pci_pwrctrl_device_set_ready(struct device *dev, + struct pci_pwrctrl *pwrctrl); -#endif /* __PCI_PWRCTL_H__ */ +#endif /* __PCI_PWRCTRL_H__ */ -- cgit v1.2.3 From 3273d8ad947dea925a65a78ca29e5351c960c801 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 8 Nov 2024 09:25:54 +0800 Subject: f2fs: fix to do cast in F2FS_{BLK_TO_BYTES, BTYES_TO_BLK} to avoid overflow It missed to cast variable to unsigned long long type before bit shift, which will cause overflow, fix it. Fixes: f7ef9b83b583 ("f2fs: introduce macros to convert bytes and blocks in f2fs") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index b0b821edfd97..3b2ad444c002 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -24,10 +24,10 @@ #define NEW_ADDR ((block_t)-1) /* used as block_t addresses */ #define COMPRESS_ADDR ((block_t)-2) /* used as compressed data flag */ -#define F2FS_BYTES_TO_BLK(bytes) ((bytes) >> F2FS_BLKSIZE_BITS) -#define F2FS_BLK_TO_BYTES(blk) ((blk) << F2FS_BLKSIZE_BITS) +#define F2FS_BYTES_TO_BLK(bytes) ((unsigned long long)(bytes) >> F2FS_BLKSIZE_BITS) +#define F2FS_BLK_TO_BYTES(blk) ((unsigned long long)(blk) << F2FS_BLKSIZE_BITS) #define F2FS_BLK_END_BYTES(blk) (F2FS_BLK_TO_BYTES(blk + 1) - 1) -#define F2FS_BLK_ALIGN(x) (F2FS_BYTES_TO_BLK((x) + F2FS_BLKSIZE - 1)) +#define F2FS_BLK_ALIGN(x) (F2FS_BYTES_TO_BLK((x) + F2FS_BLKSIZE - 1)) /* 0, 1(node nid), 2(meta nid) are reserved node id */ #define F2FS_RESERVED_NODE_NUM 3 -- cgit v1.2.3 From 77569f785c8624fa4189795fb52e635a973672e5 Mon Sep 17 00:00:00 2001 From: Zhiguo Niu Date: Fri, 8 Nov 2024 09:25:56 +0800 Subject: f2fs: fix to adjust appropriate length for fiemap If user give a file size as "length" parameter for fiemap operations, but if this size is non-block size aligned, it will show 2 segments fiemap results even this whole file is contiguous on disk, such as the following results: ./f2fs_io fiemap 0 19034 ylog/analyzer.py Fiemap: offset = 0 len = 19034 logical addr. physical addr. length flags 0 0000000000000000 0000000020baa000 0000000000004000 00001000 1 0000000000004000 0000000020bae000 0000000000001000 00001001 after this patch: ./f2fs_io fiemap 0 19034 ylog/analyzer.py Fiemap: offset = 0 len = 19034 logical addr. physical addr. length flags 0 0000000000000000 00000000315f3000 0000000000005000 00001001 Signed-off-by: Zhiguo Niu Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 3b2ad444c002..c24f8bc01045 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -24,6 +24,7 @@ #define NEW_ADDR ((block_t)-1) /* used as block_t addresses */ #define COMPRESS_ADDR ((block_t)-2) /* used as compressed data flag */ +#define F2FS_BLKSIZE_MASK (F2FS_BLKSIZE - 1) #define F2FS_BYTES_TO_BLK(bytes) ((unsigned long long)(bytes) >> F2FS_BLKSIZE_BITS) #define F2FS_BLK_TO_BYTES(blk) ((unsigned long long)(blk) << F2FS_BLKSIZE_BITS) #define F2FS_BLK_END_BYTES(blk) (F2FS_BLK_TO_BYTES(blk + 1) - 1) -- cgit v1.2.3 From 7d2f9f870f26798699585f96fa7a2a2cab38dc02 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 20 Nov 2024 15:10:07 +0800 Subject: nvme: introduce change ptpl and iekey definition This is for the next tuning pr code more readble patch, make linux/nvme.h's changes separately. Signed-off-by: Guixin Liu Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 2baf9a80b470..13377dde4527 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -2171,4 +2171,13 @@ enum nvme_pr_release_action { NVME_PR_RELEASE_ACT_CLEAR = 1, }; +enum nvme_pr_change_ptpl { + NVME_PR_CPTPL_NO_CHANGE = 0, + NVME_PR_CPTPL_RESV = 1 << 30, + NVME_PR_CPTPL_CLEARED = 2 << 30, + NVME_PR_CPTPL_PERSIST = 3 << 30, +}; + +#define NVME_PR_IGNORE_KEY (1 << 3) + #endif /* _LINUX_NVME_H */ -- cgit v1.2.3 From 64214c2b95364d26cdff045d8bbefd37380edbe1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 14 Nov 2024 15:55:30 -0400 Subject: iommu: Add ops->domain_alloc_nested() It turns out all the drivers that are using this immediately call into another function, so just make that function directly into the op. This makes paging=NULL for domain_alloc_user and we can remove the argument in the next patch. The function mirrors the similar op in the viommu that allocates a nested domain on top of the viommu's nesting parent. This version supports cases where a viommu is not being used. Link: https://patch.msgid.link/r/1-v1-c252ebdeb57b+329-iommu_paging_flags_jgg@nvidia.com Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe --- include/linux/iommu.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index d6aaaec3caf4..0472cc124519 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -559,15 +559,13 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size, * the caller iommu_domain_alloc() returns. * @domain_alloc_user: Allocate an iommu domain corresponding to the input * parameters as defined in include/uapi/linux/iommufd.h. - * Upon success, if the @user_data is valid and the @parent - * points to a kernel-managed domain, the new domain must be - * IOMMU_DOMAIN_NESTED type; otherwise, the @parent must be - * NULL while the @user_data can be optionally provided, the + * The @user_data can be optionally provided, the * new domain must support __IOMMU_DOMAIN_PAGING. * Upon failure, ERR_PTR must be returned. * @domain_alloc_paging: Allocate an iommu_domain that can be used for * UNMANAGED, DMA, and DMA_FQ domain types. * @domain_alloc_sva: Allocate an iommu_domain for Shared Virtual Addressing. + * @domain_alloc_nested: Allocate an iommu_domain for nested translation. * @probe_device: Add device to iommu driver handling * @release_device: Remove device from iommu driver handling * @probe_finalize: Do final setup work after the device is added to an IOMMU @@ -622,6 +620,9 @@ struct iommu_ops { struct iommu_domain *(*domain_alloc_paging)(struct device *dev); struct iommu_domain *(*domain_alloc_sva)(struct device *dev, struct mm_struct *mm); + struct iommu_domain *(*domain_alloc_nested)( + struct device *dev, struct iommu_domain *parent, u32 flags, + const struct iommu_user_data *user_data); struct iommu_device *(*probe_device)(struct device *dev); void (*release_device)(struct device *dev); -- cgit v1.2.3 From d53764723ecd639a0cc0c5ad24146847fc09f78d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 14 Nov 2024 15:55:31 -0400 Subject: iommu: Rename ops->domain_alloc_user() to domain_alloc_paging_flags() Now that the main domain allocating path is calling this function it doesn't make sense to leave it named _user. Change the name to alloc_paging_flags() to mirror the new iommu_paging_domain_alloc_flags() function. A driver should implement only one of ops->domain_alloc_paging() or ops->domain_alloc_paging_flags(). The former is a simpler interface with less boiler plate that the majority of drivers use. The latter is for drivers with a greater feature set (PASID, multiple page table support, advanced iommufd support, nesting, etc). Additional patches will be needed to achieve this. Link: https://patch.msgid.link/r/2-v1-c252ebdeb57b+329-iommu_paging_flags_jgg@nvidia.com Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe --- include/linux/iommu.h | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 0472cc124519..1e3308e89996 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -557,13 +557,17 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size, * @domain_alloc: allocate and return an iommu domain if success. Otherwise * NULL is returned. The domain is not fully initialized until * the caller iommu_domain_alloc() returns. - * @domain_alloc_user: Allocate an iommu domain corresponding to the input - * parameters as defined in include/uapi/linux/iommufd.h. - * The @user_data can be optionally provided, the - * new domain must support __IOMMU_DOMAIN_PAGING. - * Upon failure, ERR_PTR must be returned. + * @domain_alloc_paging_flags: Allocate an iommu domain corresponding to the + * input parameters as defined in + * include/uapi/linux/iommufd.h. The @user_data can be + * optionally provided, the new domain must support + * __IOMMU_DOMAIN_PAGING. Upon failure, ERR_PTR must be + * returned. * @domain_alloc_paging: Allocate an iommu_domain that can be used for - * UNMANAGED, DMA, and DMA_FQ domain types. + * UNMANAGED, DMA, and DMA_FQ domain types. This is the + * same as invoking domain_alloc_paging_flags() with + * @flags=0, @user_data=NULL. A driver should implement + * only one of the two ops. * @domain_alloc_sva: Allocate an iommu_domain for Shared Virtual Addressing. * @domain_alloc_nested: Allocate an iommu_domain for nested translation. * @probe_device: Add device to iommu driver handling @@ -614,8 +618,8 @@ struct iommu_ops { /* Domain allocation and freeing by the iommu driver */ struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type); - struct iommu_domain *(*domain_alloc_user)( - struct device *dev, u32 flags, struct iommu_domain *parent, + struct iommu_domain *(*domain_alloc_paging_flags)( + struct device *dev, u32 flags, const struct iommu_user_data *user_data); struct iommu_domain *(*domain_alloc_paging)(struct device *dev); struct iommu_domain *(*domain_alloc_sva)(struct device *dev, -- cgit v1.2.3 From 9d8a2b033db179bef9b6b5bad492f611a0fe89b7 Mon Sep 17 00:00:00 2001 From: Miao Wang Date: Thu, 21 Nov 2024 22:25:21 +0800 Subject: ACPI: introduce acpi_arch_init() To avoid arch-specific code in general ACPI initialization flow, introduce a weak symbol acpi_arch_init(). Currently, arm64 and riscv can utillize this to insert their arch-specific flow. In the future, other architectures can also have a chance to define their own arch-specific ACPI initialization process if necessary. Reviewed-by: Sunil V L Reviewed-by: Sudeep Holla Acked-by: Hanjun Guo Signed-off-by: Miao Wang Link: https://patch.msgid.link/20241121-intro-acpi-arch-init-v4-1-b1fb517e7d8b@gmail.com [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 7dd24acd9ffe..05f39fbfa485 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1530,17 +1530,7 @@ static inline int find_acpi_cpu_topology_hetero_id(unsigned int cpu) } #endif -#ifdef CONFIG_ARM64 -void acpi_arm_init(void); -#else -static inline void acpi_arm_init(void) { } -#endif - -#ifdef CONFIG_RISCV -void acpi_riscv_init(void); -#else -static inline void acpi_riscv_init(void) { } -#endif +void acpi_arch_init(void); #ifdef CONFIG_ACPI_PCC void acpi_init_pcc(void); -- cgit v1.2.3 From 0172afefbfbdd8987787c926b40b68400bd1c3d1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 22 Nov 2024 21:28:49 +0100 Subject: tracing: Record task flag NEED_RESCHED_LAZY. The scheduler added NEED_RESCHED_LAZY scheduling. Record this state as part of trace flags and expose it in the need_resched field. Record and expose NEED_RESCHED_LAZY. [bigeasy: Commit description, documentation bits.] Cc: Peter Zijlstra Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241122202849.7DfYpJR0@linutronix.de Reviewed-by: Ankur Arora Reviewed-by: Steven Rostedt (Google) Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 016b29a56c87..2a5df5b62cfc 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -184,6 +184,7 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status); enum trace_flag_type { TRACE_FLAG_IRQS_OFF = 0x01, + TRACE_FLAG_NEED_RESCHED_LAZY = 0x02, TRACE_FLAG_NEED_RESCHED = 0x04, TRACE_FLAG_HARDIRQ = 0x08, TRACE_FLAG_SOFTIRQ = 0x10, -- cgit v1.2.3 From 306d40aa53b671ea72c3bf272f85ccb6c1b0bc65 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Sat, 23 Nov 2024 10:30:27 -0500 Subject: tracing: Move it_func[0] comment to the relevant context When introducing __DO_TRACE_CALL(), the iteration over it_func moved from __DO_TRACE() to __tracepoint_iter_##_name(), but the comment relevant for this iterator was left in its original location. Move the comment to the relevant context. Fixes: d25e37d89dd2 ("tracepoint: Optimize using static_call()") Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Michael Jeanson Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Cc: Jordan Rife Link: https://lore.kernel.org/20241123153031.2884933-2-mathieu.desnoyers@efficios.com Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 425123e921ac..d390e8cabf02 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -210,9 +210,6 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #endif /* CONFIG_HAVE_STATIC_CALL */ /* - * it_func[0] is never NULL because there is at least one element in the array - * when the array itself is non NULL. - * * With @syscall=0, the tracepoint callback array dereference is * protected by disabling preemption. * With @syscall=1, the tracepoint callback array dereference is @@ -316,6 +313,9 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) * We have no guarantee that gcc and the linker won't up-align the tracepoint * structures, so we create an array of pointers that will be used for iteration * on the tracepoints. + * + * it_func[0] is never NULL because there is at least one element in the array + * when the array itself is non NULL. */ #define __DEFINE_TRACE_EXT(_name, _ext, proto, args) \ static const char __tpstrtab_##_name[] \ -- cgit v1.2.3 From 89c7e17f303e2d56d50a162bb65d786d3a43963e Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Sat, 23 Nov 2024 10:30:28 -0500 Subject: tracing: Remove __idx variable from __DO_TRACE Since the removal of SRCU-protected tracepoints, the __idx variable in __DO_TRACE is unused. Remove this variable. Fixes: 48bcda684823 ("tracing: Remove definition of trace_*_rcuidle()") Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Michael Jeanson Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Cc: Jordan Rife Link: https://lore.kernel.org/20241123153031.2884933-3-mathieu.desnoyers@efficios.com Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index d390e8cabf02..867f3c1ac7dc 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -218,8 +218,6 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) */ #define __DO_TRACE(name, args, cond, syscall) \ do { \ - int __maybe_unused __idx = 0; \ - \ if (!(cond)) \ return; \ \ -- cgit v1.2.3 From 7c565a4d4e437034755a06b7d61361add3b98882 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Sat, 23 Nov 2024 10:30:29 -0500 Subject: rcupdate_trace: Define rcu_tasks_trace lock guard Define a rcu_tasks_trace lock guard for use by the syscall enter/exit tracepoints. Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Michael Jeanson Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Cc: Jordan Rife Cc: linux-trace-kernel@vger.kernel.org Link: https://lore.kernel.org/20241123153031.2884933-4-mathieu.desnoyers@efficios.com Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/linux/rcupdate_trace.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h index eda493200663..e6c44eb428ab 100644 --- a/include/linux/rcupdate_trace.h +++ b/include/linux/rcupdate_trace.h @@ -10,6 +10,7 @@ #include #include +#include extern struct lockdep_map rcu_trace_lock_map; @@ -98,4 +99,8 @@ static inline void rcu_read_lock_trace(void) { BUG(); } static inline void rcu_read_unlock_trace(void) { BUG(); } #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ +DEFINE_LOCK_GUARD_0(rcu_tasks_trace, + rcu_read_lock_trace(), + rcu_read_unlock_trace()) + #endif /* __LINUX_RCUPDATE_TRACE_H */ -- cgit v1.2.3 From 98bf0fbb65226809a8df4d1948c5b6cf0c91590f Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Sat, 23 Nov 2024 10:30:30 -0500 Subject: tracing: Remove conditional locking from __DO_TRACE() Remove conditional locking by moving the __DO_TRACE() code into trace_##name(). When the faultable syscall tracepoints were implemented, __DO_TRACE() had a rcuidle argument which selected between SRCU and preempt disable. Therefore, the RCU tasks trace protection for faultable syscall tracepoints was introduced using the same pattern. At that point, it did not appear obvious that this feedback from Linus [1] applied here as well, because the __DO_TRACE() modification was extending a pre-existing pattern. Shortly before pulling the faultable syscall tracepoints modifications, Steven removed the rcuidle argument and SRCU protection scheme entirely from tracepoint.h: commit 48bcda684823 ("tracing: Remove definition of trace_*_rcuidle()") This required a rebase of the faultable syscall tracepoints series, which missed a perfect opportunity to integrate the prior recommendation from Linus. In response to the pull request, Linus pointed out [2] that he was not pleased by the implementation, expecting this to be fixed in a follow up patch series. Move __DO_TRACE() code into trace_##name() within each of __DECLARE_TRACE() and __DECLARE_TRACE_SYSCALL(). Use a scoped guard to guard the preempt disable notrace and RCU tasks trace critical sections. Link: https://lore.kernel.org/all/CAHk-=wggDLDeTKbhb5hh--x=-DQd69v41137M72m6NOTmbD-cw@mail.gmail.com/ [1] Link: https://lore.kernel.org/lkml/CAHk-=witPrLcu22dZ93VCyRQonS7+-dFYhQbna=KBa-TAhayMw@mail.gmail.com/ [2] Fixes: a363d27cdbc2 ("tracing: Allow system call tracepoints to handle page faults") Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Michael Jeanson Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Cc: Jordan Rife Cc: linux-trace-kernel@vger.kernel.org Link: https://lore.kernel.org/20241123153031.2884933-5-mathieu.desnoyers@efficios.com Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 45 ++++++++++++--------------------------------- 1 file changed, 12 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 867f3c1ac7dc..832f49b56b1f 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -209,31 +209,6 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DO_TRACE_CALL(name, args) __traceiter_##name(NULL, args) #endif /* CONFIG_HAVE_STATIC_CALL */ -/* - * With @syscall=0, the tracepoint callback array dereference is - * protected by disabling preemption. - * With @syscall=1, the tracepoint callback array dereference is - * protected by Tasks Trace RCU, which allows probes to handle page - * faults. - */ -#define __DO_TRACE(name, args, cond, syscall) \ - do { \ - if (!(cond)) \ - return; \ - \ - if (syscall) \ - rcu_read_lock_trace(); \ - else \ - preempt_disable_notrace(); \ - \ - __DO_TRACE_CALL(name, TP_ARGS(args)); \ - \ - if (syscall) \ - rcu_read_unlock_trace(); \ - else \ - preempt_enable_notrace(); \ - } while (0) - /* * Make sure the alignment of the structure in the __tracepoints section will * not add unwanted padding between the beginning of the section and the @@ -282,10 +257,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), cond, PARAMS(data_proto)) \ static inline void trace_##name(proto) \ { \ - if (static_branch_unlikely(&__tracepoint_##name.key)) \ - __DO_TRACE(name, \ - TP_ARGS(args), \ - TP_CONDITION(cond), 0); \ + if (static_branch_unlikely(&__tracepoint_##name.key)) { \ + if (cond) { \ + scoped_guard(preempt_notrace) \ + __DO_TRACE_CALL(name, TP_ARGS(args)); \ + } \ + } \ if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ WARN_ONCE(!rcu_is_watching(), \ "RCU not watching for tracepoint"); \ @@ -297,10 +274,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) static inline void trace_##name(proto) \ { \ might_fault(); \ - if (static_branch_unlikely(&__tracepoint_##name.key)) \ - __DO_TRACE(name, \ - TP_ARGS(args), \ - TP_CONDITION(cond), 1); \ + if (static_branch_unlikely(&__tracepoint_##name.key)) { \ + if (cond) { \ + scoped_guard(rcu_tasks_trace) \ + __DO_TRACE_CALL(name, TP_ARGS(args)); \ + } \ + } \ if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ WARN_ONCE(!rcu_is_watching(), \ "RCU not watching for tracepoint"); \ -- cgit v1.2.3 From ef0d4186083127d2f99ed04e051fd94ba061d253 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Sat, 23 Nov 2024 10:30:31 -0500 Subject: tracing: Remove cond argument from __DECLARE_TRACE_SYSCALL Syscall tracepoints do not require a "cond" argument, because they are meant to be used only for sys_enter and sys_exit instrumentation, which don't require condition evaluation. Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Michael Jeanson Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Cc: Jordan Rife Cc: linux-trace-kernel@vger.kernel.org Link: https://lore.kernel.org/20241123153031.2884933-6-mathieu.desnoyers@efficios.com Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 832f49b56b1f..b2633a72e871 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -220,7 +220,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) * site if it is not watching, as it will need to be active when the * tracepoint is enabled. */ -#define __DECLARE_TRACE_COMMON(name, proto, args, cond, data_proto) \ +#define __DECLARE_TRACE_COMMON(name, proto, args, data_proto) \ extern int __traceiter_##name(data_proto); \ DECLARE_STATIC_CALL(tp_func_##name, __traceiter_##name); \ extern struct tracepoint __tracepoint_##name; \ @@ -254,7 +254,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) } #define __DECLARE_TRACE(name, proto, args, cond, data_proto) \ - __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), cond, PARAMS(data_proto)) \ + __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) \ static inline void trace_##name(proto) \ { \ if (static_branch_unlikely(&__tracepoint_##name.key)) { \ @@ -269,18 +269,16 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) } \ } -#define __DECLARE_TRACE_SYSCALL(name, proto, args, cond, data_proto) \ - __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), cond, PARAMS(data_proto)) \ +#define __DECLARE_TRACE_SYSCALL(name, proto, args, data_proto) \ + __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) \ static inline void trace_##name(proto) \ { \ might_fault(); \ if (static_branch_unlikely(&__tracepoint_##name.key)) { \ - if (cond) { \ - scoped_guard(rcu_tasks_trace) \ - __DO_TRACE_CALL(name, TP_ARGS(args)); \ - } \ + scoped_guard(rcu_tasks_trace) \ + __DO_TRACE_CALL(name, TP_ARGS(args)); \ } \ - if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ + if (IS_ENABLED(CONFIG_LOCKDEP)) { \ WARN_ONCE(!rcu_is_watching(), \ "RCU not watching for tracepoint"); \ } \ @@ -363,7 +361,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #else /* !TRACEPOINTS_ENABLED */ -#define __DECLARE_TRACE(name, proto, args, cond, data_proto) \ +#define __DECLARE_TRACE_COMMON(name, proto, args, data_proto) \ static inline void trace_##name(proto) \ { } \ static inline int \ @@ -387,7 +385,11 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) return false; \ } -#define __DECLARE_TRACE_SYSCALL __DECLARE_TRACE +#define __DECLARE_TRACE(name, proto, args, cond, data_proto) \ + __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) + +#define __DECLARE_TRACE_SYSCALL(name, proto, args, data_proto) \ + __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) #define DEFINE_TRACE_FN(name, reg, unreg, proto, args) #define DEFINE_TRACE_SYSCALL(name, reg, unreg, proto, args) @@ -453,7 +455,6 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define DECLARE_TRACE_SYSCALL(name, proto, args) \ __DECLARE_TRACE_SYSCALL(name, PARAMS(proto), PARAMS(args), \ - cpu_online(raw_smp_processor_id()), \ PARAMS(void *__data, proto)) #define TRACE_EVENT_FLAGS(event, flag) -- cgit v1.2.3 From 2bd9b57d04df417f31ef54448477c212fcdd14fc Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 25 Nov 2024 09:25:14 -0500 Subject: tracing: Use guard() rather than scoped_guard() Using scoped_guard() in the implementation of trace_##name() adds an unnecessary level of indentation. Cc: Steven Rostedt Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Michael Jeanson Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Yonghong Song Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Namhyung Kim Cc: Andrii Nakryiko Cc: bpf@vger.kernel.org Cc: Joel Fernandes Cc: Jordan Rife Link: https://lore.kernel.org/20241125142514.2897143-1-mathieu.desnoyers@efficios.com Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index b2633a72e871..e398f6e43f61 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -259,8 +259,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) { \ if (static_branch_unlikely(&__tracepoint_##name.key)) { \ if (cond) { \ - scoped_guard(preempt_notrace) \ - __DO_TRACE_CALL(name, TP_ARGS(args)); \ + guard(preempt_notrace)(); \ + __DO_TRACE_CALL(name, TP_ARGS(args)); \ } \ } \ if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ @@ -275,8 +275,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) { \ might_fault(); \ if (static_branch_unlikely(&__tracepoint_##name.key)) { \ - scoped_guard(rcu_tasks_trace) \ - __DO_TRACE_CALL(name, TP_ARGS(args)); \ + guard(rcu_tasks_trace)(); \ + __DO_TRACE_CALL(name, TP_ARGS(args)); \ } \ if (IS_ENABLED(CONFIG_LOCKDEP)) { \ WARN_ONCE(!rcu_is_watching(), \ -- cgit v1.2.3 From 222974c6ec9d901f7ad13bbe6e505ec1f1d822d4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 25 Nov 2024 18:45:33 -0800 Subject: iommu: remove stale declaration left over by a merge conflict The merge commit ae3325f752ef ("Merge branches 'arm/smmu', 'mediatek', 's390', 'ti/omap', 'riscv' and 'core' into next") left a stale declaration of 'iommu_present()' even though the 'core' branch that was merged had removed the function (and the declaration). Remove it for real. Reported-by: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Joerg Roedel Signed-off-by: Linus Torvalds --- include/linux/iommu.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index d6aaaec3caf4..301e97d745c1 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -841,7 +841,6 @@ static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather) }; } -extern bool iommu_present(const struct bus_type *bus); extern bool device_iommu_capable(struct device *dev, enum iommu_cap cap); extern bool iommu_group_has_isolated_msi(struct iommu_group *group); struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev, unsigned int flags); -- cgit v1.2.3 From bb43a59944f45e89aa158740b8a16ba8f0b0fa2b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 7 Nov 2024 01:14:40 +0900 Subject: Rename .data.unlikely to .data..unlikely Commit 7ccaba5314ca ("consolidate WARN_...ONCE() static variables") was intended to collect all .data.unlikely sections into one chunk. However, this has not worked when CONFIG_LD_DEAD_CODE_DATA_ELIMINATION or CONFIG_LTO_CLANG is enabled, because .data.unlikely matches the .data.[0-9a-zA-Z_]* pattern in the DATA_MAIN macro. Commit cb87481ee89d ("kbuild: linker script do not match C names unless LD_DEAD_CODE_DATA_ELIMINATION is configured") was introduced to suppress the issue for the default CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=n case, providing a minimal fix for stable backporting. We were aware this did not address the issue for CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y. The plan was to apply correct fixes and then revert cb87481ee89d. [1] Seven years have passed since then, yet the #ifdef workaround remains in place. Using a ".." separator in the section name fixes the issue for CONFIG_LD_DEAD_CODE_DATA_ELIMINATION and CONFIG_LTO_CLANG. [1]: https://lore.kernel.org/linux-kbuild/CAK7LNASck6BfdLnESxXUeECYL26yUDm0cwRZuM4gmaWUkxjL5g@mail.gmail.com/ Fixes: cb87481ee89d ("kbuild: linker script do not match C names unless LD_DEAD_CODE_DATA_ELIMINATION is configured") Signed-off-by: Masahiro Yamada --- include/linux/rcupdate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 58d84c59f3dd..48e5c03df1dd 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -401,7 +401,7 @@ static inline int debug_lockdep_rcu_enabled(void) */ #define RCU_LOCKDEP_WARN(c, s) \ do { \ - static bool __section(".data.unlikely") __warned; \ + static bool __section(".data..unlikely") __warned; \ if (debug_lockdep_rcu_enabled() && (c) && \ debug_lockdep_rcu_enabled() && !__warned) { \ __warned = true; \ -- cgit v1.2.3 From dbefa1f31a91670c9e7dac9b559625336206466f Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 7 Nov 2024 01:14:41 +0900 Subject: Rename .data.once to .data..once to fix resetting WARN*_ONCE Commit b1fca27d384e ("kernel debug: support resetting WARN*_ONCE") added support for clearing the state of once warnings. However, it is not functional when CONFIG_LD_DEAD_CODE_DATA_ELIMINATION or CONFIG_LTO_CLANG is enabled, because .data.once matches the .data.[0-9a-zA-Z_]* pattern in the DATA_MAIN macro. Commit cb87481ee89d ("kbuild: linker script do not match C names unless LD_DEAD_CODE_DATA_ELIMINATION is configured") was introduced to suppress the issue for the default CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=n case, providing a minimal fix for stable backporting. We were aware this did not address the issue for CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y. The plan was to apply correct fixes and then revert cb87481ee89d. [1] Seven years have passed since then, yet the #ifdef workaround remains in place. Meanwhile, commit b1fca27d384e introduced the .data.once section, and commit dc5723b02e52 ("kbuild: add support for Clang LTO") extended the #ifdef. Using a ".." separator in the section name fixes the issue for CONFIG_LD_DEAD_CODE_DATA_ELIMINATION and CONFIG_LTO_CLANG. [1]: https://lore.kernel.org/linux-kbuild/CAK7LNASck6BfdLnESxXUeECYL26yUDm0cwRZuM4gmaWUkxjL5g@mail.gmail.com/ Fixes: b1fca27d384e ("kernel debug: support resetting WARN*_ONCE") Fixes: dc5723b02e52 ("kbuild: add support for Clang LTO") Signed-off-by: Masahiro Yamada --- include/linux/mmdebug.h | 6 +++--- include/linux/once.h | 4 ++-- include/linux/once_lite.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 39a7714605a7..d7cb1e5ecbda 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -46,7 +46,7 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi); } \ } while (0) #define VM_WARN_ON_ONCE_PAGE(cond, page) ({ \ - static bool __section(".data.once") __warned; \ + static bool __section(".data..once") __warned; \ int __ret_warn_once = !!(cond); \ \ if (unlikely(__ret_warn_once && !__warned)) { \ @@ -66,7 +66,7 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi); unlikely(__ret_warn); \ }) #define VM_WARN_ON_ONCE_FOLIO(cond, folio) ({ \ - static bool __section(".data.once") __warned; \ + static bool __section(".data..once") __warned; \ int __ret_warn_once = !!(cond); \ \ if (unlikely(__ret_warn_once && !__warned)) { \ @@ -77,7 +77,7 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi); unlikely(__ret_warn_once); \ }) #define VM_WARN_ON_ONCE_MM(cond, mm) ({ \ - static bool __section(".data.once") __warned; \ + static bool __section(".data..once") __warned; \ int __ret_warn_once = !!(cond); \ \ if (unlikely(__ret_warn_once && !__warned)) { \ diff --git a/include/linux/once.h b/include/linux/once.h index bc714d414448..30346fcdc799 100644 --- a/include/linux/once.h +++ b/include/linux/once.h @@ -46,7 +46,7 @@ void __do_once_sleepable_done(bool *done, struct static_key_true *once_key, #define DO_ONCE(func, ...) \ ({ \ bool ___ret = false; \ - static bool __section(".data.once") ___done = false; \ + static bool __section(".data..once") ___done = false; \ static DEFINE_STATIC_KEY_TRUE(___once_key); \ if (static_branch_unlikely(&___once_key)) { \ unsigned long ___flags; \ @@ -64,7 +64,7 @@ void __do_once_sleepable_done(bool *done, struct static_key_true *once_key, #define DO_ONCE_SLEEPABLE(func, ...) \ ({ \ bool ___ret = false; \ - static bool __section(".data.once") ___done = false; \ + static bool __section(".data..once") ___done = false; \ static DEFINE_STATIC_KEY_TRUE(___once_key); \ if (static_branch_unlikely(&___once_key)) { \ ___ret = __do_once_sleepable_start(&___done); \ diff --git a/include/linux/once_lite.h b/include/linux/once_lite.h index b7bce4983638..27de7bc32a06 100644 --- a/include/linux/once_lite.h +++ b/include/linux/once_lite.h @@ -12,7 +12,7 @@ #define __ONCE_LITE_IF(condition) \ ({ \ - static bool __section(".data.once") __already_done; \ + static bool __section(".data..once") __already_done; \ bool __ret_cond = !!(condition); \ bool __ret_once = false; \ \ -- cgit v1.2.3 From 81de291d86b704de1809cfb06672902d003cf3a3 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Wed, 6 Nov 2024 17:33:28 +0800 Subject: of: dynamic: Add of_changeset_update_prop_string Add a helper function to add string property updates to an OF changeset. This is similar to of_changeset_add_prop_string(), but instead of adding the property (and failing if it exists), it will update the property. This shall be used later in the DT hardware prober. Signed-off-by: Chen-Yu Tsai Reviewed-by: Rob Herring (Arm) Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Wolfram Sang --- include/linux/of.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 086a60f3b8a6..d0307e3b093d 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -1651,6 +1651,10 @@ static inline int of_changeset_add_prop_u32(struct of_changeset *ocs, return of_changeset_add_prop_u32_array(ocs, np, prop_name, &val, 1); } +int of_changeset_update_prop_string(struct of_changeset *ocs, + struct device_node *np, + const char *prop_name, const char *str); + int of_changeset_add_prop_bool(struct of_changeset *ocs, struct device_node *np, const char *prop_name); -- cgit v1.2.3 From 1fcc67e3a354865775355eafec1fb061a755c971 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Wed, 6 Nov 2024 17:33:29 +0800 Subject: of: base: Add for_each_child_of_node_with_prefix() There are cases where drivers would go through child device nodes and operate on only the ones whose node name starts with a given prefix. Provide a helper for these users. This will mainly be used in a subsequent patch that implements a hardware component prober for I2C busses. Signed-off-by: Chen-Yu Tsai Reviewed-by: Rob Herring (Arm) Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Wolfram Sang --- include/linux/of.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index d0307e3b093d..f921786cb8ac 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -289,6 +289,9 @@ extern struct device_node *of_get_parent(const struct device_node *node); extern struct device_node *of_get_next_parent(struct device_node *node); extern struct device_node *of_get_next_child(const struct device_node *node, struct device_node *prev); +extern struct device_node *of_get_next_child_with_prefix(const struct device_node *node, + struct device_node *prev, + const char *prefix); extern struct device_node *of_get_next_available_child( const struct device_node *node, struct device_node *prev); extern struct device_node *of_get_next_reserved_child( @@ -1468,6 +1471,12 @@ static inline int of_property_read_s32(const struct device_node *np, child != NULL; \ child = of_get_next_child(parent, child)) +#define for_each_child_of_node_with_prefix(parent, child, prefix) \ + for (struct device_node *child __free(device_node) = \ + of_get_next_child_with_prefix(parent, NULL, prefix); \ + child != NULL; \ + child = of_get_next_child_with_prefix(parent, child, prefix)) + #define for_each_available_child_of_node(parent, child) \ for (child = of_get_next_available_child(parent, NULL); child != NULL; \ child = of_get_next_available_child(parent, child)) -- cgit v1.2.3 From 157ce8f381efe264933e9366db828d845bade3a1 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Wed, 6 Nov 2024 17:33:30 +0800 Subject: i2c: Introduce OF component probe function Some devices are designed and manufactured with some components having multiple drop-in replacement options. These components are often connected to the mainboard via ribbon cables, having the same signals and pin assignments across all options. These may include the display panel and touchscreen on laptops and tablets, and the trackpad on laptops. Sometimes which component option is used in a particular device can be detected by some firmware provided identifier, other times that information is not available, and the kernel has to try to probe each device. This change attempts to make the "probe each device" case cleaner. The current approach is to have all options added and enabled in the device tree. The kernel would then bind each device and run each driver's probe function. This works, but has been broken before due to the introduction of asynchronous probing, causing multiple instances requesting "shared" resources, such as pinmuxes, GPIO pins, interrupt lines, at the same time, with only one instance succeeding. Work arounds for these include moving the pinmux to the parent I2C controller, using GPIO hogs or pinmux settings to keep the GPIO pins in some fixed configuration, and requesting the interrupt line very late. Such configurations can be seen on the MT8183 Krane Chromebook tablets, and the Qualcomm sc8280xp-based Lenovo Thinkpad 13S. Instead of this delicate dance between drivers and device tree quirks, this change introduces a simple I2C component probe function. For a given class of devices on the same I2C bus, it will go through all of them, doing a simple I2C read transfer and see which one of them responds. It will then enable the device that responds. This requires some minor modifications in the existing device tree. The status for all the device nodes for the component options must be set to "fail-needs-probe". This makes it clear that some mechanism is needed to enable one of them, and also prevents the prober and device drivers running at the same time. Signed-off-by: Chen-Yu Tsai Reviewed-by: Andy Shevchenko Reviewed-by: Douglas Anderson Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Wolfram Sang --- include/linux/i2c-of-prober.h | 75 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 include/linux/i2c-of-prober.h (limited to 'include/linux') diff --git a/include/linux/i2c-of-prober.h b/include/linux/i2c-of-prober.h new file mode 100644 index 000000000000..e7e052ac9e48 --- /dev/null +++ b/include/linux/i2c-of-prober.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Definitions for the Linux I2C OF component prober + * + * Copyright (C) 2024 Google LLC + */ + +#ifndef _LINUX_I2C_OF_PROBER_H +#define _LINUX_I2C_OF_PROBER_H + +#include + +struct device; +struct device_node; + +/** + * struct i2c_of_probe_ops - I2C OF component prober callbacks + * + * A set of callbacks to be used by i2c_of_probe_component(). + * + * All callbacks are optional. Callbacks are called only once per run, and are + * used in the order they are defined in this structure. + * + * All callbacks that have return values shall return %0 on success, + * or a negative error number on failure. + * + * The @dev parameter passed to the callbacks is the same as @dev passed to + * i2c_of_probe_component(). It should only be used for dev_printk() calls + * and nothing else, especially not managed device resource (devres) APIs. + */ +struct i2c_of_probe_ops { + /** + * @enable: Retrieve and enable resources so that the components respond to probes. + * + * It is OK for this callback to return -EPROBE_DEFER since the intended use includes + * retrieving resources and enables them. Resources should be reverted to their initial + * state and released before returning if this fails. + */ + int (*enable)(struct device *dev, struct device_node *bus_node, void *data); + + /** + * @cleanup_early: Release exclusive resources prior to calling probe() on a + * detected component. + * + * Only called if a matching component is actually found. If none are found, + * resources that would have been released in this callback should be released in + * @free_resourcs_late instead. + */ + void (*cleanup_early)(struct device *dev, void *data); + + /** + * @cleanup: Opposite of @enable to balance refcounts and free resources after probing. + * + * Should check if resources were already freed by @cleanup_early. + */ + void (*cleanup)(struct device *dev, void *data); +}; + +/** + * struct i2c_of_probe_cfg - I2C OF component prober configuration + * @ops: Callbacks for the prober to use. + * @type: A string to match the device node name prefix to probe for. + */ +struct i2c_of_probe_cfg { + const struct i2c_of_probe_ops *ops; + const char *type; +}; + +#if IS_ENABLED(CONFIG_OF_DYNAMIC) + +int i2c_of_probe_component(struct device *dev, const struct i2c_of_probe_cfg *cfg, void *ctx); + +#endif /* IS_ENABLED(CONFIG_OF_DYNAMIC) */ + +#endif /* _LINUX_I2C_OF_PROBER_H */ -- cgit v1.2.3 From 897261149d255d03fc90bec6782e3835cacbfdde Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Wed, 6 Nov 2024 17:33:31 +0800 Subject: i2c: of-prober: Add simple helpers for regulator support Add helpers to do regulator management for the I2C OF component prober. Components that the prober intends to probe likely require their regulator supplies be enabled, and GPIOs be toggled to enable them or bring them out of reset before they will respond to probe attempts. GPIOs will be handled in the next patch. The assumption is that the same class of components to be probed are always connected in the same fashion with the same regulator supply and GPIO. The names may vary due to binding differences, but the physical layout does not change. This set of helpers supports at most one regulator supply. The user must specify the node from which the supply is retrieved. The supply name and the amount of time to wait after the supply is enabled are also given by the user. Signed-off-by: Chen-Yu Tsai Reviewed-by: Douglas Anderson Reviewed-by: Andy Shevchenko Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Wolfram Sang --- include/linux/i2c-of-prober.h | 44 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'include/linux') diff --git a/include/linux/i2c-of-prober.h b/include/linux/i2c-of-prober.h index e7e052ac9e48..df95aa6ad90e 100644 --- a/include/linux/i2c-of-prober.h +++ b/include/linux/i2c-of-prober.h @@ -70,6 +70,50 @@ struct i2c_of_probe_cfg { int i2c_of_probe_component(struct device *dev, const struct i2c_of_probe_cfg *cfg, void *ctx); +/** + * DOC: I2C OF component prober simple helpers + * + * Components such as trackpads are commonly connected to a devices baseboard + * with a 6-pin ribbon cable. That gives at most one voltage supply and one + * GPIO (commonly a "enable" or "reset" line) besides the I2C bus, interrupt + * pin, and common ground. Touchscreens, while integrated into the display + * panel's connection, typically have the same set of connections. + * + * A simple set of helpers are provided here for use with the I2C OF component + * prober. This implementation targets such components, allowing for at most + * one regulator supply. + * + * The following helpers are provided: + * * i2c_of_probe_simple_enable() + * * i2c_of_probe_simple_cleanup() + */ + +/** + * struct i2c_of_probe_simple_opts - Options for simple I2C component prober callbacks + * @res_node_compatible: Compatible string of device node to retrieve resources from. + * @supply_name: Name of regulator supply. + * @post_power_on_delay_ms: Delay after regulators are powered on. Passed to msleep(). + */ +struct i2c_of_probe_simple_opts { + const char *res_node_compatible; + const char *supply_name; + unsigned int post_power_on_delay_ms; +}; + +struct regulator; + +struct i2c_of_probe_simple_ctx { + /* public: provided by user before helpers are used. */ + const struct i2c_of_probe_simple_opts *opts; + /* private: internal fields for helpers. */ + struct regulator *supply; +}; + +int i2c_of_probe_simple_enable(struct device *dev, struct device_node *bus_node, void *data); +void i2c_of_probe_simple_cleanup(struct device *dev, void *data); + +extern struct i2c_of_probe_ops i2c_of_probe_simple_ops; + #endif /* IS_ENABLED(CONFIG_OF_DYNAMIC) */ #endif /* _LINUX_I2C_OF_PROBER_H */ -- cgit v1.2.3 From 39b415f84654892003cebb7c026b7daa3380610b Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Wed, 6 Nov 2024 17:33:32 +0800 Subject: i2c: of-prober: Add GPIO support to simple helpers Add GPIO support to the simple helpers for the I2C OF component prober. Components that the prober intends to probe likely require their regulator supplies be enabled, and GPIOs be toggled to enable them or bring them out of reset before they will respond to probe attempts. Regulator supplies were handled in the previous patch. The assumption is that the same class of components to be probed are always connected in the same fashion with the same regulator supply and GPIO. The names may vary due to binding differences, but the physical layout does not change. This supports at most one GPIO pin. The user must specify the GPIO name, the polarity, and the amount of time to wait after the GPIO is toggled. Devices with more than one GPIO pin likely require specific power sequencing beyond what generic code can easily support. Signed-off-by: Chen-Yu Tsai Reviewed-by: Douglas Anderson Reviewed-by: Andy Shevchenko Tested-by: Andrey Skvortsov Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Wolfram Sang --- include/linux/i2c-of-prober.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/i2c-of-prober.h b/include/linux/i2c-of-prober.h index df95aa6ad90e..bb6d47f50ee5 100644 --- a/include/linux/i2c-of-prober.h +++ b/include/linux/i2c-of-prober.h @@ -9,6 +9,7 @@ #define _LINUX_I2C_OF_PROBER_H #include +#include struct device; struct device_node; @@ -85,6 +86,7 @@ int i2c_of_probe_component(struct device *dev, const struct i2c_of_probe_cfg *cf * * The following helpers are provided: * * i2c_of_probe_simple_enable() + * * i2c_of_probe_simple_cleanup_early() * * i2c_of_probe_simple_cleanup() */ @@ -92,14 +94,31 @@ int i2c_of_probe_component(struct device *dev, const struct i2c_of_probe_cfg *cf * struct i2c_of_probe_simple_opts - Options for simple I2C component prober callbacks * @res_node_compatible: Compatible string of device node to retrieve resources from. * @supply_name: Name of regulator supply. + * @gpio_name: Name of GPIO. NULL if no GPIO line is used. Empty string ("") if GPIO + * line is unnamed. * @post_power_on_delay_ms: Delay after regulators are powered on. Passed to msleep(). + * @post_gpio_config_delay_ms: Delay after GPIO is configured. Passed to msleep(). + * @gpio_assert_to_enable: %true if GPIO should be asserted, i.e. set to logical high, + * to enable the component. + * + * This describes power sequences common for the class of components supported by the + * simple component prober: + * * @gpio_name is configured to the non-active setting according to @gpio_assert_to_enable. + * * @supply_name regulator supply is enabled. + * * Wait for @post_power_on_delay_ms to pass. + * * @gpio_name is configured to the active setting according to @gpio_assert_to_enable. + * * Wait for @post_gpio_config_delay_ms to pass. */ struct i2c_of_probe_simple_opts { const char *res_node_compatible; const char *supply_name; + const char *gpio_name; unsigned int post_power_on_delay_ms; + unsigned int post_gpio_config_delay_ms; + bool gpio_assert_to_enable; }; +struct gpio_desc; struct regulator; struct i2c_of_probe_simple_ctx { @@ -107,9 +126,11 @@ struct i2c_of_probe_simple_ctx { const struct i2c_of_probe_simple_opts *opts; /* private: internal fields for helpers. */ struct regulator *supply; + struct gpio_desc *gpiod; }; int i2c_of_probe_simple_enable(struct device *dev, struct device_node *bus_node, void *data); +void i2c_of_probe_simple_cleanup_early(struct device *dev, void *data); void i2c_of_probe_simple_cleanup(struct device *dev, void *data); extern struct i2c_of_probe_ops i2c_of_probe_simple_ops; -- cgit v1.2.3 From 054a9cd395a79f4a5595f2cd24542e9bca8723c8 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 20 Nov 2024 08:56:52 +0900 Subject: modpost: rename alias symbol for MODULE_DEVICE_TABLE() This commit renames the alias symbol, __mod____device_table to __mod_device_table____. This change simplifies the code slightly, as there is no longer a need to check both the prefix and suffix. Signed-off-by: Masahiro Yamada --- include/linux/module.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 88ecc5e9f523..3dd79a3d0cbf 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -247,7 +247,7 @@ extern void cleanup_module(void); #ifdef MODULE /* Creates an alias so file2alias.c can find device table. */ #define MODULE_DEVICE_TABLE(type, name) \ -extern typeof(name) __mod_##type##__##name##_device_table \ +extern typeof(name) __mod_device_table__##type##__##name \ __attribute__ ((unused, alias(__stringify(name)))) #else /* !MODULE */ #define MODULE_DEVICE_TABLE(type, name) -- cgit v1.2.3 From 49b2b973325add467270e906d4d794aa6679b38b Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Tue, 19 Nov 2024 14:31:43 +0100 Subject: net: Comment copy_from_sockptr() explaining its behaviour copy_from_sockptr() has a history of misuse. Add a comment explaining that the function follows API of copy_from_user(), i.e. returns 0 for success, or number of bytes not copied on error. Signed-off-by: Michal Luczaj Signed-off-by: Paolo Abeni --- include/linux/sockptr.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index 195debe2b1db..3e6c8e9d67ae 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -53,6 +53,8 @@ static inline int copy_from_sockptr_offset(void *dst, sockptr_t src, /* Deprecated. * This is unsafe, unless caller checked user provided optlen. * Prefer copy_safe_from_sockptr() instead. + * + * Returns 0 for success, or number of bytes not copied on error. */ static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size) { -- cgit v1.2.3 From e2668c34b7e1a2288ea0a97ccf3cd12e2870ca18 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Sat, 23 Nov 2024 14:50:12 +0000 Subject: net: phy: fix phy_ethtool_set_eee() incorrectly enabling LPI When phy_ethtool_set_eee_noneg() detects a change in the LPI parameters, it attempts to update phylib state and trigger the link to cycle so the MAC sees the updated parameters. However, in doing so, it sets phydev->enable_tx_lpi depending on whether the EEE configuration allows the MAC to generate LPI without taking into account the result of negotiation. This can be demonstrated with a 1000base-T FD interface by: # ethtool --set-eee eno0 advertise 8 # cause EEE to be not negotiated # ethtool --set-eee eno0 tx-lpi off # ethtool --set-eee eno0 tx-lpi on This results in being true, despite EEE not having been negotiated and: # ethtool --show-eee eno0 EEE status: enabled - inactive Tx LPI: 250 (us) Supported EEE link modes: 100baseT/Full 1000baseT/Full Advertised EEE link modes: 100baseT/Full 1000baseT/Full Fix this by keeping track of whether EEE was negotiated via a new eee_active member in struct phy_device, and include this state in the decision whether phydev->enable_tx_lpi should be set. Fixes: 3e43b903da04 ("net: phy: Immediately call adjust_link if only tx_lpi_enabled changes") Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tErSe-005RhB-2R@rmk-PC.armlinux.org.uk Signed-off-by: Paolo Abeni --- include/linux/phy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 77c6d6451638..563c46205685 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -602,6 +602,7 @@ struct macsec_ops; * @supported_eee: supported PHY EEE linkmodes * @advertising_eee: Currently advertised EEE linkmodes * @enable_tx_lpi: When True, MAC should transmit LPI to PHY + * @eee_active: phylib private state, indicating that EEE has been negotiated * @eee_cfg: User configuration of EEE * @lp_advertising: Current link partner advertised linkmodes * @host_interfaces: PHY interface modes supported by host @@ -723,6 +724,7 @@ struct phy_device { /* Energy efficient ethernet modes which should be prohibited */ __ETHTOOL_DECLARE_LINK_MODE_MASK(eee_broken_modes); bool enable_tx_lpi; + bool eee_active; struct eee_config eee_cfg; /* Host supported PHY interface types. Should be ignored if empty. */ -- cgit v1.2.3 From f9a11da1d92f78279afafdc811214b88cfe54a77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 27 Nov 2024 17:41:56 +0100 Subject: HID: bpf: constify hid_ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hid_ops struct is never modified. Mark it as const. Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20241127-hid-bpf-ops-v1-1-f9e41bfa3afd@weissschuh.net Signed-off-by: Benjamin Tissoires --- include/linux/hid_bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index a6876ab29004..a2e47dbcf82c 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -78,7 +78,7 @@ struct hid_ops { const struct bus_type *bus_type; }; -extern struct hid_ops *hid_ops; +extern const struct hid_ops *hid_ops; /** * struct hid_bpf_ops - A BPF struct_ops of callbacks allowing to attach HID-BPF -- cgit v1.2.3 From f69e63756f7822fcdad8a34f9967e8b243e883ee Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 2 Oct 2024 18:31:47 +0100 Subject: printf: Remove unused 'bprintf' bprintf() is unused. Remove it. It was added in the commit 4370aa4aa753 ("vsprintf: add binary printf") but as far as I can see was never used, unlike the other two functions in that patch. Link: https://lore.kernel.org/20241002173147.210107-1-linux@treblig.org Reviewed-by: Andy Shevchenko Acked-by: Petr Mladek Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Steven Rostedt (Google) --- include/linux/string.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index 0dd27afcfaf7..493ac4862c77 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -335,7 +335,6 @@ int __sysfs_match_string(const char * const *array, size_t n, const char *s); #ifdef CONFIG_BINARY_PRINTF int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args); int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf); -int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4); #endif extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, -- cgit v1.2.3 From e70140ba0d2b1a30467d4af6bcfe761327b9ec95 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 1 Dec 2024 15:12:43 -0800 Subject: Get rid of 'remove_new' relic from platform driver struct The continual trickle of small conversion patches is grating on me, and is really not helping. Just get rid of the 'remove_new' member function, which is just an alias for the plain 'remove', and had a comment to that effect: /* * .remove_new() is a relic from a prototype conversion of .remove(). * New drivers are supposed to implement .remove(). Once all drivers are * converted to not use .remove_new any more, it will be dropped. */ This was just a tree-wide 'sed' script that replaced '.remove_new' with '.remove', with some care taken to turn a subsequent tab into two tabs to make things line up. I did do some minimal manual whitespace adjustment for places that used spaces to line things up. Then I just removed the old (sic) .remove_new member function, and this is the end result. No more unnecessary conversion noise. Signed-off-by: Linus Torvalds --- include/linux/platform_device.h | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 7132623e4658..074754c23d33 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -235,17 +235,7 @@ extern void platform_device_put(struct platform_device *pdev); struct platform_driver { int (*probe)(struct platform_device *); - - /* - * .remove_new() is a relic from a prototype conversion of .remove(). - * New drivers are supposed to implement .remove(). Once all drivers are - * converted to not use .remove_new any more, it will be dropped. - */ - union { - void (*remove)(struct platform_device *); - void (*remove_new)(struct platform_device *); - }; - + void (*remove)(struct platform_device *); void (*shutdown)(struct platform_device *); int (*suspend)(struct platform_device *, pm_message_t state); int (*resume)(struct platform_device *); -- cgit v1.2.3 From 0302d2fd6efb0c386e521df0134eb2679a9a138f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Wed, 27 Nov 2024 09:54:30 +0100 Subject: locking/ww_mutex: Fix ww_mutex dummy lockdep map selftest warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The below commit introduces a dummy lockdep map, but didn't get the initialization quite right (it should mimic the initialization of the real ww_mutex lockdep maps). It also introduced a separate locking api selftest failure. Fix these. Closes: https://lore.kernel.org/lkml/Zw19sMtnKdyOVQoh@boqun-archlinux/ Fixes: 823a566221a5 ("locking/ww_mutex: Adjust to lockdep nest_lock requirements") Reported-by: Boqun Feng Suggested-by: Boqun Feng Signed-off-by: Thomas Hellström Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20241127085430.3045-1-thomas.hellstrom@linux.intel.com --- include/linux/ww_mutex.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h index a401a2f31a77..45ff6f7a872b 100644 --- a/include/linux/ww_mutex.h +++ b/include/linux/ww_mutex.h @@ -156,8 +156,8 @@ static inline void ww_acquire_init(struct ww_acquire_ctx *ctx, debug_check_no_locks_freed((void *)ctx, sizeof(*ctx)); lockdep_init_map(&ctx->dep_map, ww_class->acquire_name, &ww_class->acquire_key, 0); - lockdep_init_map(&ctx->first_lock_dep_map, ww_class->mutex_name, - &ww_class->mutex_key, 0); + lockdep_init_map_wait(&ctx->first_lock_dep_map, ww_class->mutex_name, + &ww_class->mutex_key, 0, LD_WAIT_SLEEP); mutex_acquire(&ctx->dep_map, 0, 0, _RET_IP_); mutex_acquire_nest(&ctx->first_lock_dep_map, 0, 0, &ctx->dep_map, _RET_IP_); #endif -- cgit v1.2.3 From cdd30ebb1b9f36159d66f088b61aee264e649d7a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 2 Dec 2024 15:59:47 +0100 Subject: module: Convert symbol namespace to string literal Clean up the existing export namespace code along the same lines of commit 33def8498fdd ("treewide: Convert macro and uses of __section(foo) to __section("foo")") and for the same reason, it is not desired for the namespace argument to be a macro expansion itself. Scripted using git grep -l -e MODULE_IMPORT_NS -e EXPORT_SYMBOL_NS | while read file; do awk -i inplace ' /^#define EXPORT_SYMBOL_NS/ { gsub(/__stringify\(ns\)/, "ns"); print; next; } /^#define MODULE_IMPORT_NS/ { gsub(/__stringify\(ns\)/, "ns"); print; next; } /MODULE_IMPORT_NS/ { $0 = gensub(/MODULE_IMPORT_NS\(([^)]*)\)/, "MODULE_IMPORT_NS(\"\\1\")", "g"); } /EXPORT_SYMBOL_NS/ { if ($0 ~ /(EXPORT_SYMBOL_NS[^(]*)\(([^,]+),/) { if ($0 !~ /(EXPORT_SYMBOL_NS[^(]*)\(([^,]+), ([^)]+)\)/ && $0 !~ /(EXPORT_SYMBOL_NS[^(]*)\(\)/ && $0 !~ /^my/) { getline line; gsub(/[[:space:]]*\\$/, ""); gsub(/[[:space:]]/, "", line); $0 = $0 " " line; } $0 = gensub(/(EXPORT_SYMBOL_NS[^(]*)\(([^,]+), ([^)]+)\)/, "\\1(\\2, \"\\3\")", "g"); } } { print }' $file; done Requested-by: Masahiro Yamada Signed-off-by: Peter Zijlstra (Intel) Link: https://mail.google.com/mail/u/2/#inbox/FMfcgzQXKWgMmjdFwwdsfgxzKpVHWPlc Acked-by: Greg KH Signed-off-by: Linus Torvalds --- include/linux/acpi.h | 2 +- include/linux/export.h | 4 ++-- include/linux/fw_table.h | 2 +- include/linux/module.h | 2 +- include/linux/pm.h | 2 +- include/linux/pwm.h | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 05f39fbfa485..6adcd1b92b20 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -40,7 +40,7 @@ struct irq_domain_ops; #include #ifdef CONFIG_ACPI_TABLE_LIB -#define EXPORT_SYMBOL_ACPI_LIB(x) EXPORT_SYMBOL_NS_GPL(x, ACPI) +#define EXPORT_SYMBOL_ACPI_LIB(x) EXPORT_SYMBOL_NS_GPL(x, "ACPI") #define __init_or_acpilib #define __initdata_or_acpilib #else diff --git a/include/linux/export.h b/include/linux/export.h index 0bbd02fd351d..f5f3950a1e42 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -67,7 +67,7 @@ #define EXPORT_SYMBOL(sym) _EXPORT_SYMBOL(sym, "") #define EXPORT_SYMBOL_GPL(sym) _EXPORT_SYMBOL(sym, "GPL") -#define EXPORT_SYMBOL_NS(sym, ns) __EXPORT_SYMBOL(sym, "", __stringify(ns)) -#define EXPORT_SYMBOL_NS_GPL(sym, ns) __EXPORT_SYMBOL(sym, "GPL", __stringify(ns)) +#define EXPORT_SYMBOL_NS(sym, ns) __EXPORT_SYMBOL(sym, "", ns) +#define EXPORT_SYMBOL_NS_GPL(sym, ns) __EXPORT_SYMBOL(sym, "GPL", ns) #endif /* _LINUX_EXPORT_H */ diff --git a/include/linux/fw_table.h b/include/linux/fw_table.h index 3ff4c277296f..9bd605b87c4c 100644 --- a/include/linux/fw_table.h +++ b/include/linux/fw_table.h @@ -54,7 +54,7 @@ int cdat_table_parse(enum acpi_cdat_type type, #define EXPORT_SYMBOL_FWTBL_LIB(x) EXPORT_SYMBOL_ACPI_LIB(x) #define __init_or_fwtbl_lib __init_or_acpilib #else -#define EXPORT_SYMBOL_FWTBL_LIB(x) EXPORT_SYMBOL_NS_GPL(x, CXL) +#define EXPORT_SYMBOL_FWTBL_LIB(x) EXPORT_SYMBOL_NS_GPL(x, "CXL") #define __init_or_fwtbl_lib #endif diff --git a/include/linux/module.h b/include/linux/module.h index c60ee39cb9b1..94acbacdcdf1 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -296,7 +296,7 @@ extern typeof(name) __mod_device_table__##type##__##name \ * files require multiple MODULE_FIRMWARE() specifiers */ #define MODULE_FIRMWARE(_firmware) MODULE_INFO(firmware, _firmware) -#define MODULE_IMPORT_NS(ns) MODULE_INFO(import_ns, __stringify(ns)) +#define MODULE_IMPORT_NS(ns) MODULE_INFO(import_ns, ns) struct notifier_block; diff --git a/include/linux/pm.h b/include/linux/pm.h index 97b0e23363c8..e7f0260f15ad 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -385,7 +385,7 @@ const struct dev_pm_ops name = { \ #ifdef CONFIG_PM #define _EXPORT_DEV_PM_OPS(name, license, ns) _EXPORT_PM_OPS(name, license, ns) #define EXPORT_PM_FN_GPL(name) EXPORT_SYMBOL_GPL(name) -#define EXPORT_PM_FN_NS_GPL(name, ns) EXPORT_SYMBOL_NS_GPL(name, ns) +#define EXPORT_PM_FN_NS_GPL(name, ns) EXPORT_SYMBOL_NS_GPL(name, "ns") #else #define _EXPORT_DEV_PM_OPS(name, license, ns) _DISCARD_PM_OPS(name, license, ns) #define EXPORT_PM_FN_GPL(name) diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 78827f312407..6853e29d9674 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -8,7 +8,7 @@ #include #include -MODULE_IMPORT_NS(PWM); +MODULE_IMPORT_NS("PWM"); struct pwm_chip; -- cgit v1.2.3 From a07d2d7930c75e6bf88683b376d09ab1f3fed2aa Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Tue, 3 Dec 2024 11:31:05 +0100 Subject: io_uring: Change res2 parameter type in io_uring_cmd_done Change the type of the res2 parameter in io_uring_cmd_done from ssize_t to u64. This aligns the parameter type with io_req_set_cqe32_extra, which expects u64 arguments. The change eliminates potential issues on 32-bit architectures where ssize_t might be 32-bit. Only user of passing res2 is drivers/nvme/host/ioctl.c and it actually passes u64. Fixes: ee692a21e9bf ("fs,io_uring: add infrastructure for uring-cmd") Cc: stable@vger.kernel.org Reviewed-by: Kanchan Joshi Tested-by: Li Zetao Reviewed-by: Li Zetao Signed-off-by: Bernd Schubert Link: https://lore.kernel.org/r/20241203-io_uring_cmd_done-res2-as-u64-v2-1-5e59ae617151@ddn.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 578a3fdf5c71..0d5448c0b86c 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -43,7 +43,7 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, * Note: the caller should never hard code @issue_flags and is only allowed * to pass the mask provided by the core io_uring code. */ -void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2, +void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, u64 res2, unsigned issue_flags); void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, @@ -67,7 +67,7 @@ static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, return -EOPNOTSUPP; } static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, - ssize_t ret2, unsigned issue_flags) + u64 ret2, unsigned issue_flags) { } static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, -- cgit v1.2.3 From ceb8bf2ceaa77fe222fe8fe32cb7789c9099ddf1 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 3 Dec 2024 19:21:07 +0900 Subject: module: Convert default symbol namespace to string literal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit cdd30ebb1b9f ("module: Convert symbol namespace to string literal") only converted MODULE_IMPORT_NS() and EXPORT_SYMBOL_NS(), leaving DEFAULT_SYMBOL_NAMESPACE as a macro expansion. This commit converts DEFAULT_SYMBOL_NAMESPACE in the same way to avoid annoyance for the default namespace as well. Signed-off-by: Masahiro Yamada Reviewed-by: Uwe Kleine-König Signed-off-by: Linus Torvalds --- include/linux/export.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/export.h b/include/linux/export.h index f5f3950a1e42..2633df4d31e6 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -60,7 +60,7 @@ #endif #ifdef DEFAULT_SYMBOL_NAMESPACE -#define _EXPORT_SYMBOL(sym, license) __EXPORT_SYMBOL(sym, license, __stringify(DEFAULT_SYMBOL_NAMESPACE)) +#define _EXPORT_SYMBOL(sym, license) __EXPORT_SYMBOL(sym, license, DEFAULT_SYMBOL_NAMESPACE) #else #define _EXPORT_SYMBOL(sym, license) __EXPORT_SYMBOL(sym, license, "") #endif -- cgit v1.2.3 From 6fe437cfe2cdc797b03f63b338a13fac96ed6a08 Mon Sep 17 00:00:00 2001 From: Levi Yun Date: Tue, 3 Dec 2024 14:31:08 +0000 Subject: firmware: arm_ffa: Fix the race around setting ffa_dev->properties Currently, ffa_dev->properties is set after the ffa_device_register() call return in ffa_setup_partitions(). This could potentially result in a race where the partition's properties is accessed while probing struct ffa_device before it is set. Update the ffa_device_register() to receive ffa_partition_info so all the data from the partition information received from the firmware can be updated into the struct ffa_device before the calling device_register() in ffa_device_register(). Fixes: e781858488b9 ("firmware: arm_ffa: Add initial FFA bus support for device enumeration") Signed-off-by: Levi Yun Message-Id: <20241203143109.1030514-2-yeoreum.yun@arm.com> Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index a28e2a6a13d0..74169dd0f659 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -166,9 +166,12 @@ static inline void *ffa_dev_get_drvdata(struct ffa_device *fdev) return dev_get_drvdata(&fdev->dev); } +struct ffa_partition_info; + #if IS_REACHABLE(CONFIG_ARM_FFA_TRANSPORT) -struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, - const struct ffa_ops *ops); +struct ffa_device * +ffa_device_register(const struct ffa_partition_info *part_info, + const struct ffa_ops *ops); void ffa_device_unregister(struct ffa_device *ffa_dev); int ffa_driver_register(struct ffa_driver *driver, struct module *owner, const char *mod_name); @@ -176,9 +179,9 @@ void ffa_driver_unregister(struct ffa_driver *driver); bool ffa_device_is_valid(struct ffa_device *ffa_dev); #else -static inline -struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, - const struct ffa_ops *ops) +static inline struct ffa_device * +ffa_device_register(const struct ffa_partition_info *part_info, + const struct ffa_ops *ops) { return NULL; } -- cgit v1.2.3 From 76031d9536a076bf023bedbdb1b4317fc801dd67 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Dec 2024 11:16:30 +0100 Subject: clocksource: Make negative motion detection more robust Guenter reported boot stalls on a emulated ARM 32-bit platform, which has a 24-bit wide clocksource. It turns out that the calculated maximal idle time, which limits idle sleeps to prevent clocksource wrap arounds, is close to the point where the negative motion detection triggers. max_idle_ns: 597268854 ns negative motion tripping point: 671088640 ns If the idle wakeup is delayed beyond that point, the clocksource advances far enough to trigger the negative motion detection. This prevents the clock to advance and in the worst case the system stalls completely if the consecutive sleeps based on the stale clock are delayed as well. Cure this by calculating a more robust cut-off value for negative motion, which covers 87.5% of the actual clocksource counter width. Compare the delta against this value to catch negative motion. This is specifically for clock sources with a small counter width as their wrap around time is close to the half counter width. For clock sources with wide counters this is not a problem because the maximum idle time is far from the half counter width due to the math overflow protection constraints. For the case at hand this results in a tripping point of 1174405120ns. Note, that this cannot prevent issues when the delay exceeds the 87.5% margin, but that's not different from the previous unchecked version which allowed arbitrary time jumps. Systems with small counter width are prone to invalid results, but this problem is unlikely to be seen on real hardware. If such a system completely stalls for more than half a second, then there are other more urgent problems than the counter wrapping around. Fixes: c163e40af9b2 ("timekeeping: Always check for negative motion") Reported-by: Guenter Roeck Signed-off-by: Thomas Gleixner Tested-by: Guenter Roeck Link: https://lore.kernel.org/all/8734j5ul4x.ffs@tglx Closes: https://lore.kernel.org/all/387b120b-d68a-45e8-b6ab-768cd95d11c2@roeck-us.net --- include/linux/clocksource.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index ef1b16da6ad5..65b7c41471c3 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -49,6 +49,7 @@ struct module; * @archdata: Optional arch-specific data * @max_cycles: Maximum safe cycle value which won't overflow on * multiplication + * @max_raw_delta: Maximum safe delta value for negative motion detection * @name: Pointer to clocksource name * @list: List head for registration (internal) * @freq_khz: Clocksource frequency in khz. @@ -109,6 +110,7 @@ struct clocksource { struct arch_clocksource_data archdata; #endif u64 max_cycles; + u64 max_raw_delta; const char *name; struct list_head list; u32 freq_khz; -- cgit v1.2.3 From 4de22b2a6a7477d84d9a01eb6b62a9117309d722 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 25 Nov 2024 20:17:18 +0000 Subject: mm: open-code PageTail in folio_flags() and const_folio_flags() It is unsafe to call PageTail() in dump_page() as page_is_fake_head() will almost certainly return true when called on a head page that is copied to the stack. That will cause the VM_BUG_ON_PGFLAGS() in const_folio_flags() to trigger when it shouldn't. Fortunately, we don't need to call PageTail() here; it's fine to have a pointer to a virtual alias of the page's flag word rather than the real page's flag word. Link: https://lkml.kernel.org/r/20241125201721.2963278-1-willy@infradead.org Fixes: fae7d834c43c ("mm: add __dump_folio()") Signed-off-by: Matthew Wilcox (Oracle) Cc: Kees Cook Cc: Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 2220bfec278e..cf46ac720802 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -306,7 +306,7 @@ static const unsigned long *const_folio_flags(const struct folio *folio, { const struct page *page = &folio->page; - VM_BUG_ON_PGFLAGS(PageTail(page), page); + VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page); return &page[n].flags; } @@ -315,7 +315,7 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n) { struct page *page = &folio->page; - VM_BUG_ON_PGFLAGS(PageTail(page), page); + VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page); return &page[n].flags; } -- cgit v1.2.3 From 031e04bdc834cda3b054ef6b698503b2b97e8186 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 22 Nov 2024 16:39:47 +0100 Subject: stackdepot: fix stack_depot_save_flags() in NMI context Per documentation, stack_depot_save_flags() was meant to be usable from NMI context if STACK_DEPOT_FLAG_CAN_ALLOC is unset. However, it still would try to take the pool_lock in an attempt to save a stack trace in the current pool (if space is available). This could result in deadlock if an NMI is handled while pool_lock is already held. To avoid deadlock, only try to take the lock in NMI context and give up if unsuccessful. The documentation is fixed to clearly convey this. Link: https://lkml.kernel.org/r/Z0CcyfbPqmxJ9uJH@elver.google.com Link: https://lkml.kernel.org/r/20241122154051.3914732-1-elver@google.com Fixes: 4434a56ec209 ("stackdepot: make fast paths lock-less again") Signed-off-by: Marco Elver Reported-by: Sebastian Andrzej Siewior Reviewed-by: Sebastian Andrzej Siewior Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Oscar Salvador Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index e9ec32fb97d4..2cc21ffcdaf9 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -147,7 +147,7 @@ static inline int stack_depot_early_init(void) { return 0; } * If the provided stack trace comes from the interrupt context, only the part * up to the interrupt entry is saved. * - * Context: Any context, but setting STACK_DEPOT_FLAG_CAN_ALLOC is required if + * Context: Any context, but unsetting STACK_DEPOT_FLAG_CAN_ALLOC is required if * alloc_pages() cannot be used from the current context. Currently * this is the case for contexts where neither %GFP_ATOMIC nor * %GFP_NOWAIT can be used (NMI, raw_spin_lock). @@ -156,7 +156,7 @@ static inline int stack_depot_early_init(void) { return 0; } */ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, unsigned int nr_entries, - gfp_t gfp_flags, + gfp_t alloc_flags, depot_flags_t depot_flags); /** @@ -175,7 +175,7 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, * Return: Handle of the stack trace stored in depot, 0 on failure */ depot_stack_handle_t stack_depot_save(unsigned long *entries, - unsigned int nr_entries, gfp_t gfp_flags); + unsigned int nr_entries, gfp_t alloc_flags); /** * __stack_depot_get_stack_record - Get a pointer to a stack_record struct -- cgit v1.2.3 From 51f43d5d82ed2ba3f9a3f9a2390c52f28e42af32 Mon Sep 17 00:00:00 2001 From: David Wang <00107082@163.com> Date: Fri, 29 Nov 2024 10:52:13 +0800 Subject: mm/codetag: swap tags when migrate pages Current solution to adjust codetag references during page migration is done in 3 steps: 1. sets the codetag reference of the old page as empty (not pointing to any codetag); 2. subtracts counters of the new page to compensate for its own allocation; 3. sets codetag reference of the new page to point to the codetag of the old page. This does not work if CONFIG_MEM_ALLOC_PROFILING_DEBUG=n because set_codetag_empty() becomes NOOP. Instead, let's simply swap codetag references so that the new page is referencing the old codetag and the old page is referencing the new codetag. This way accounting stays valid and the logic makes more sense. Link: https://lkml.kernel.org/r/20241129025213.34836-1-00107082@163.com Fixes: e0a955bf7f61 ("mm/codetag: add pgalloc_tag_copy()") Signed-off-by: David Wang <00107082@163.com> Closes: https://lore.kernel.org/lkml/20241124074318.399027-1-00107082@163.com/ Acked-by: Suren Baghdasaryan Suggested-by: Suren Baghdasaryan Acked-by: Yu Zhao Cc: Kent Overstreet Signed-off-by: Andrew Morton --- include/linux/pgalloc_tag.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 0e43ab653ab6..3469c4b20105 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -231,7 +231,7 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) } void pgalloc_tag_split(struct folio *folio, int old_order, int new_order); -void pgalloc_tag_copy(struct folio *new, struct folio *old); +void pgalloc_tag_swap(struct folio *new, struct folio *old); void __init alloc_tag_sec_init(void); @@ -245,7 +245,7 @@ static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {} static inline void alloc_tag_sec_init(void) {} static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) {} -static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) {} +static inline void pgalloc_tag_swap(struct folio *new, struct folio *old) {} #endif /* CONFIG_MEM_ALLOC_PROFILING */ -- cgit v1.2.3 From d89c8ec0546184267cb211b579514ebaf8916100 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 29 Nov 2024 18:24:06 -0800 Subject: scatterlist: fix incorrect func name in kernel-doc Fix a kernel-doc warning by making the kernel-doc function description match the function name: include/linux/scatterlist.h:323: warning: expecting prototype for sg_unmark_bus_address(). Prototype was for sg_dma_unmark_bus_address() instead Link: https://lkml.kernel.org/r/20241130022406.537973-1-rdunlap@infradead.org Fixes: 42399301203e ("lib/scatterlist: add flag for indicating P2PDMA segments in an SGL") Signed-off-by: Randy Dunlap Cc: Logan Gunthorpe Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/scatterlist.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index c5e2239b550e..d836e7440ee8 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -313,7 +313,7 @@ static inline void sg_dma_mark_bus_address(struct scatterlist *sg) } /** - * sg_unmark_bus_address - Unmark the scatterlist entry as a bus address + * sg_dma_unmark_bus_address - Unmark the scatterlist entry as a bus address * @sg: SG entry * * Description: -- cgit v1.2.3 From b4d83c8323b0c4a899a996fed919cfe10720d289 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 6 Dec 2024 10:19:31 +0100 Subject: headers/cleanup.h: Remove the if_not_guard() facility Linus noticed that the new if_not_guard() definition is fragile: "This macro generates actively wrong code if it happens to be inside an if-statement or a loop without a block. IOW, code like this: for (iterate-over-something) if_not_guard(a) return -BUSY; looks like will build fine, but will generate completely incorrect code." The reason is that the __if_not_guard() macro is multi-statement, so while most kernel developers expect macros to be simple or at least compound statements - but for __if_not_guard() it is not so: #define __if_not_guard(_name, _id, args...) \ BUILD_BUG_ON(!__is_cond_ptr(_name)); \ CLASS(_name, _id)(args); \ if (!__guard_ptr(_name)(&_id)) To add insult to injury, the placement of the BUILD_BUG_ON() line makes the macro appear to compile fine, but it will generate incorrect code as Linus reported, for example if used within iteration or conditional statements that will use the first statement of a macro as a loop body or conditional statement body. [ I'd also like to note that the original submission by David Lechner did not contain the BUILD_BUG_ON() line, so it was safer than what we ended up committing. Mea culpa. ] It doesn't appear to be possible to turn this macro into a robust single or compound statement that could be used in single statements, due to the necessity to define an auto scope variable with an open scope and the necessity of it having to expand to a partial 'if' statement with no body. Instead of trying to work around this fragility, just remove the construct before it gets used. Reported-by: Linus Torvalds Signed-off-by: Ingo Molnar Cc: David Lechner Cc: Peter Zijlstra Link: https://lore.kernel.org/r/Z1LBnX9TpZLR5Dkf@gmail.com --- include/linux/cleanup.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 966fcc5ff8ef..ec00e3f7af2b 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -273,12 +273,6 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ * an anonymous instance of the (guard) class, not recommended for * conditional locks. * - * if_not_guard(name, args...) { }: - * convenience macro for conditional guards that calls the statement that - * follows only if the lock was not acquired (typically an error return). - * - * Only for conditional locks. - * * scoped_guard (name, args...) { }: * similar to CLASS(name, scope)(args), except the variable (with the * explicit name 'scope') is declard in a for-loop such that its scope is @@ -350,14 +344,6 @@ _label: \ #define scoped_cond_guard(_name, _fail, args...) \ __scoped_cond_guard(_name, _fail, __UNIQUE_ID(label), args) -#define __if_not_guard(_name, _id, args...) \ - BUILD_BUG_ON(!__is_cond_ptr(_name)); \ - CLASS(_name, _id)(args); \ - if (!__guard_ptr(_name)(&_id)) - -#define if_not_guard(_name, args...) \ - __if_not_guard(_name, __UNIQUE_ID(guard), args) - /* * Additional helper macros for generating lock guards with types, either for * locks that don't have a native type (eg. RCU, preempt) or those that need a -- cgit v1.2.3 From b454abfab52543c44b581afc807b9f97fc1e7a3a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 5 Dec 2024 16:55:18 +0200 Subject: net: mscc: ocelot: be resilient to loss of PTP packets during transmission The Felix DSA driver presents unique challenges that make the simplistic ocelot PTP TX timestamping procedure unreliable: any transmitted packet may be lost in hardware before it ever leaves our local system. This may happen because there is congestion on the DSA conduit, the switch CPU port or even user port (Qdiscs like taprio may delay packets indefinitely by design). The technical problem is that the kernel, i.e. ocelot_port_add_txtstamp_skb(), runs out of timestamp IDs eventually, because it never detects that packets are lost, and keeps the IDs of the lost packets on hold indefinitely. The manifestation of the issue once the entire timestamp ID range becomes busy looks like this in dmesg: mscc_felix 0000:00:00.5: port 0 delivering skb without TX timestamp mscc_felix 0000:00:00.5: port 1 delivering skb without TX timestamp At the surface level, we need a timeout timer so that the kernel knows a timestamp ID is available again. But there is a deeper problem with the implementation, which is the monotonically increasing ocelot_port->ts_id. In the presence of packet loss, it will be impossible to detect that and reuse one of the holes created in the range of free timestamp IDs. What we actually need is a bitmap of 63 timestamp IDs tracking which one is available. That is able to use up holes caused by packet loss, but also gives us a unique opportunity to not implement an actual timer_list for the timeout timer (very complicated in terms of locking). We could only declare a timestamp ID stale on demand (lazily), aka when there's no other timestamp ID available. There are pros and cons to this approach: the implementation is much more simple than per-packet timers would be, but most of the stale packets would be quasi-leaked - not really leaked, but blocked in driver memory, since this algorithm sees no reason to free them. An improved technique would be to check for stale timestamp IDs every time we allocate a new one. Assuming a constant flux of PTP packets, this avoids stale packets being blocked in memory, but of course, packets lost at the end of the flux are still blocked until the flux resumes (nobody left to kick them out). Since implementing per-packet timers is way too complicated, this should be good enough. Testing procedure: Persistently block traffic class 5 and try to run PTP on it: $ tc qdisc replace dev swp3 parent root taprio num_tc 8 \ map 0 1 2 3 4 5 6 7 queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 \ base-time 0 sched-entry S 0xdf 100000 flags 0x2 [ 126.948141] mscc_felix 0000:00:00.5: port 3 tc 5 min gate length 0 ns not enough for max frame size 1526 at 1000 Mbps, dropping frames over 1 octets including FCS $ ptp4l -i swp3 -2 -P -m --socket_priority 5 --fault_reset_interval ASAP --logSyncInterval -3 ptp4l[70.351]: port 1 (swp3): INITIALIZING to LISTENING on INIT_COMPLETE ptp4l[70.354]: port 0 (/var/run/ptp4l): INITIALIZING to LISTENING on INIT_COMPLETE ptp4l[70.358]: port 0 (/var/run/ptp4lro): INITIALIZING to LISTENING on INIT_COMPLETE [ 70.394583] mscc_felix 0000:00:00.5: port 3 timestamp id 0 ptp4l[70.406]: timed out while polling for tx timestamp ptp4l[70.406]: increasing tx_timestamp_timeout or increasing kworker priority may correct this issue, but a driver bug likely causes it ptp4l[70.406]: port 1 (swp3): send peer delay response failed ptp4l[70.407]: port 1 (swp3): clearing fault immediately ptp4l[70.952]: port 1 (swp3): new foreign master d858d7.fffe.00ca6d-1 [ 71.394858] mscc_felix 0000:00:00.5: port 3 timestamp id 1 ptp4l[71.400]: timed out while polling for tx timestamp ptp4l[71.400]: increasing tx_timestamp_timeout or increasing kworker priority may correct this issue, but a driver bug likely causes it ptp4l[71.401]: port 1 (swp3): send peer delay response failed ptp4l[71.401]: port 1 (swp3): clearing fault immediately [ 72.393616] mscc_felix 0000:00:00.5: port 3 timestamp id 2 ptp4l[72.401]: timed out while polling for tx timestamp ptp4l[72.402]: increasing tx_timestamp_timeout or increasing kworker priority may correct this issue, but a driver bug likely causes it ptp4l[72.402]: port 1 (swp3): send peer delay response failed ptp4l[72.402]: port 1 (swp3): clearing fault immediately ptp4l[72.952]: port 1 (swp3): new foreign master d858d7.fffe.00ca6d-1 [ 73.395291] mscc_felix 0000:00:00.5: port 3 timestamp id 3 ptp4l[73.400]: timed out while polling for tx timestamp ptp4l[73.400]: increasing tx_timestamp_timeout or increasing kworker priority may correct this issue, but a driver bug likely causes it ptp4l[73.400]: port 1 (swp3): send peer delay response failed ptp4l[73.400]: port 1 (swp3): clearing fault immediately [ 74.394282] mscc_felix 0000:00:00.5: port 3 timestamp id 4 ptp4l[74.400]: timed out while polling for tx timestamp ptp4l[74.401]: increasing tx_timestamp_timeout or increasing kworker priority may correct this issue, but a driver bug likely causes it ptp4l[74.401]: port 1 (swp3): send peer delay response failed ptp4l[74.401]: port 1 (swp3): clearing fault immediately ptp4l[74.953]: port 1 (swp3): new foreign master d858d7.fffe.00ca6d-1 [ 75.396830] mscc_felix 0000:00:00.5: port 3 invalidating stale timestamp ID 0 which seems lost [ 75.405760] mscc_felix 0000:00:00.5: port 3 timestamp id 0 ptp4l[75.410]: timed out while polling for tx timestamp ptp4l[75.411]: increasing tx_timestamp_timeout or increasing kworker priority may correct this issue, but a driver bug likely causes it ptp4l[75.411]: port 1 (swp3): send peer delay response failed ptp4l[75.411]: port 1 (swp3): clearing fault immediately (...) Remove the blocking condition and see that the port recovers: $ same tc command as above, but use "sched-entry S 0xff" instead $ same ptp4l command as above ptp4l[99.489]: port 1 (swp3): INITIALIZING to LISTENING on INIT_COMPLETE ptp4l[99.490]: port 0 (/var/run/ptp4l): INITIALIZING to LISTENING on INIT_COMPLETE ptp4l[99.492]: port 0 (/var/run/ptp4lro): INITIALIZING to LISTENING on INIT_COMPLETE [ 100.403768] mscc_felix 0000:00:00.5: port 3 invalidating stale timestamp ID 0 which seems lost [ 100.412545] mscc_felix 0000:00:00.5: port 3 invalidating stale timestamp ID 1 which seems lost [ 100.421283] mscc_felix 0000:00:00.5: port 3 invalidating stale timestamp ID 2 which seems lost [ 100.430015] mscc_felix 0000:00:00.5: port 3 invalidating stale timestamp ID 3 which seems lost [ 100.438744] mscc_felix 0000:00:00.5: port 3 invalidating stale timestamp ID 4 which seems lost [ 100.447470] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 100.505919] mscc_felix 0000:00:00.5: port 3 timestamp id 0 ptp4l[100.963]: port 1 (swp3): new foreign master d858d7.fffe.00ca6d-1 [ 101.405077] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 101.507953] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 102.405405] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 102.509391] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 103.406003] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 103.510011] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 104.405601] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 104.510624] mscc_felix 0000:00:00.5: port 3 timestamp id 0 ptp4l[104.965]: selected best master clock d858d7.fffe.00ca6d ptp4l[104.966]: port 1 (swp3): assuming the grand master role ptp4l[104.967]: port 1 (swp3): LISTENING to GRAND_MASTER on RS_GRAND_MASTER [ 105.106201] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 105.232420] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 105.359001] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 105.405500] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 105.485356] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 105.511220] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 105.610938] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 105.737237] mscc_felix 0000:00:00.5: port 3 timestamp id 0 (...) Notice that in this new usage pattern, a non-congested port should basically use timestamp ID 0 all the time, progressing to higher numbers only if there are unacknowledged timestamps in flight. Compare this to the old usage, where the timestamp ID used to monotonically increase modulo OCELOT_MAX_PTP_ID. In terms of implementation, this simplifies the bookkeeping of the ocelot_port :: ts_id and ptp_skbs_in_flight. Since we need to traverse the list of two-step timestampable skbs for each new packet anyway, the information can already be computed and does not need to be stored. Also, ocelot_port->tx_skbs is always accessed under the switch-wide ocelot->ts_id_lock IRQ-unsafe spinlock, so we don't need the skb queue's lock and can use the unlocked primitives safely. This problem was actually detected using the tc-taprio offload, and is causing trouble in TSN scenarios, which Felix (NXP LS1028A / VSC9959) supports but Ocelot (VSC7514) does not. Thus, I've selected the commit to blame as the one adding initial timestamping support for the Felix switch. Fixes: c0bcf537667c ("net: dsa: ocelot: add hardware timestamping support for Felix") Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20241205145519.1236778-5-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- include/linux/dsa/ocelot.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h index 6fbfbde68a37..620a3260fc08 100644 --- a/include/linux/dsa/ocelot.h +++ b/include/linux/dsa/ocelot.h @@ -15,6 +15,7 @@ struct ocelot_skb_cb { struct sk_buff *clone; unsigned int ptp_class; /* valid only for clones */ + unsigned long ptp_tx_time; /* valid only for clones */ u32 tstamp_lo; u8 ptp_cmd; u8 ts_id; -- cgit v1.2.3 From 07a756a49f4b4290b49ea46e089cbe6f79ff8d26 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Wed, 6 Nov 2024 07:42:47 -0800 Subject: Drivers: hv: util: Avoid accessing a ringbuffer not initialized yet If the KVP (or VSS) daemon starts before the VMBus channel's ringbuffer is fully initialized, we can hit the panic below: hv_utils: Registering HyperV Utility Driver hv_vmbus: registering driver hv_utils ... BUG: kernel NULL pointer dereference, address: 0000000000000000 CPU: 44 UID: 0 PID: 2552 Comm: hv_kvp_daemon Tainted: G E 6.11.0-rc3+ #1 RIP: 0010:hv_pkt_iter_first+0x12/0xd0 Call Trace: ... vmbus_recvpacket hv_kvp_onchannelcallback vmbus_on_event tasklet_action_common tasklet_action handle_softirqs irq_exit_rcu sysvec_hyperv_stimer0 asm_sysvec_hyperv_stimer0 ... kvp_register_done hvt_op_read vfs_read ksys_read __x64_sys_read This can happen because the KVP/VSS channel callback can be invoked even before the channel is fully opened: 1) as soon as hv_kvp_init() -> hvutil_transport_init() creates /dev/vmbus/hv_kvp, the kvp daemon can open the device file immediately and register itself to the driver by writing a message KVP_OP_REGISTER1 to the file (which is handled by kvp_on_msg() ->kvp_handle_handshake()) and reading the file for the driver's response, which is handled by hvt_op_read(), which calls hvt->on_read(), i.e. kvp_register_done(). 2) the problem with kvp_register_done() is that it can cause the channel callback to be called even before the channel is fully opened, and when the channel callback is starting to run, util_probe()-> vmbus_open() may have not initialized the ringbuffer yet, so the callback can hit the panic of NULL pointer dereference. To reproduce the panic consistently, we can add a "ssleep(10)" for KVP in __vmbus_open(), just before the first hv_ringbuffer_init(), and then we unload and reload the driver hv_utils, and run the daemon manually within the 10 seconds. Fix the panic by reordering the steps in util_probe() so the char dev entry used by the KVP or VSS daemon is not created until after vmbus_open() has completed. This reordering prevents the race condition from happening. Reported-by: Dexuan Cui Fixes: e0fa3e5e7df6 ("Drivers: hv: utils: fix a race on userspace daemons registration") Cc: stable@vger.kernel.org Signed-off-by: Michael Kelley Acked-by: Wei Liu Link: https://lore.kernel.org/r/20241106154247.2271-3-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20241106154247.2271-3-mhklinux@outlook.com> --- include/linux/hyperv.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 22c22fb91042..02a226bcf0ed 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1559,6 +1559,7 @@ struct hv_util_service { void *channel; void (*util_cb)(void *); int (*util_init)(struct hv_util_service *); + int (*util_init_transport)(void); void (*util_deinit)(void); int (*util_pre_suspend)(void); int (*util_pre_resume)(void); -- cgit v1.2.3 From 8d6712c892019b9b9dc5c7039edd3c9d770b510b Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Fri, 6 Dec 2024 10:10:44 +0900 Subject: virtio_ring: add a func argument 'recycle_done' to virtqueue_resize() When virtqueue_resize() has actually recycled all unused buffers, additional work may be required in some cases. Relying solely on its return status is fragile, so introduce a new function argument 'recycle_done', which is invoked when the recycle really occurs. Cc: # v6.11+ Signed-off-by: Koichiro Den Acked-by: Jason Wang Reviewed-by: Xuan Zhuo Signed-off-by: Paolo Abeni --- include/linux/virtio.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 57cc4b07fd17..0aa7df4ed5ca 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -109,7 +109,8 @@ dma_addr_t virtqueue_get_avail_addr(const struct virtqueue *vq); dma_addr_t virtqueue_get_used_addr(const struct virtqueue *vq); int virtqueue_resize(struct virtqueue *vq, u32 num, - void (*recycle)(struct virtqueue *vq, void *buf)); + void (*recycle)(struct virtqueue *vq, void *buf), + void (*recycle_done)(struct virtqueue *vq)); int virtqueue_reset(struct virtqueue *vq, void (*recycle)(struct virtqueue *vq, void *buf)); -- cgit v1.2.3 From 8d2da07c813ad333c20eb803e15f8c4541f25350 Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Fri, 6 Dec 2024 10:10:46 +0900 Subject: virtio_ring: add a func argument 'recycle_done' to virtqueue_reset() When virtqueue_reset() has actually recycled all unused buffers, additional work may be required in some cases. Relying solely on its return status is fragile, so introduce a new function argument 'recycle_done', which is invoked when it really occurs. Signed-off-by: Koichiro Den Acked-by: Jason Wang Reviewed-by: Xuan Zhuo Signed-off-by: Paolo Abeni --- include/linux/virtio.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 0aa7df4ed5ca..dd88682e27e3 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -112,7 +112,8 @@ int virtqueue_resize(struct virtqueue *vq, u32 num, void (*recycle)(struct virtqueue *vq, void *buf), void (*recycle_done)(struct virtqueue *vq)); int virtqueue_reset(struct virtqueue *vq, - void (*recycle)(struct virtqueue *vq, void *buf)); + void (*recycle)(struct virtqueue *vq, void *buf), + void (*recycle_done)(struct virtqueue *vq)); struct virtio_admin_cmd { __le16 opcode; -- cgit v1.2.3 From b76b840fd93374240b59825f1ab8e2f5c9907acb Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 9 Dec 2024 21:23:56 +0900 Subject: dm: Fix dm-zoned-reclaim zone write pointer alignment The zone reclaim processing of the dm-zoned device mapper uses blkdev_issue_zeroout() to align the write pointer of a zone being used for reclaiming another zone, to write the valid data blocks from the zone being reclaimed at the same position relative to the zone start in the reclaim target zone. The first call to blkdev_issue_zeroout() will try to use hardware offload using a REQ_OP_WRITE_ZEROES operation if the device reports a non-zero max_write_zeroes_sectors queue limit. If this operation fails because of the lack of hardware support, blkdev_issue_zeroout() falls back to using a regular write operation with the zero-page as buffer. Currently, such REQ_OP_WRITE_ZEROES failure is automatically handled by the block layer zone write plugging code which will execute a report zones operation to ensure that the write pointer of the target zone of the failed operation has not changed and to "rewind" the zone write pointer offset of the target zone as it was advanced when the write zero operation was submitted. So the REQ_OP_WRITE_ZEROES failure does not cause any issue and blkdev_issue_zeroout() works as expected. However, since the automatic recovery of zone write pointers by the zone write plugging code can potentially cause deadlocks with queue freeze operations, a different recovery must be implemented in preparation for the removal of zone write plugging report zones based recovery. Do this by introducing the new function blk_zone_issue_zeroout(). This function first calls blkdev_issue_zeroout() with the flag BLKDEV_ZERO_NOFALLBACK to intercept failures on the first execution which attempt to use the device hardware offload with the REQ_OP_WRITE_ZEROES operation. If this attempt fails, a report zone operation is issued to restore the zone write pointer offset of the target zone to the correct position and blkdev_issue_zeroout() is called again without the BLKDEV_ZERO_NOFALLBACK flag. The report zones operation performing this recovery is implemented using the helper function disk_zone_sync_wp_offset() which calls the gendisk report_zones file operation with the callback disk_report_zones_cb(). This callback updates the target write pointer offset of the target zone using the new function disk_zone_wplug_sync_wp_offset(). dmz_reclaim_align_wp() is modified to change its call to blkdev_issue_zeroout() to a call to blk_zone_issue_zeroout() without any other change needed as the two functions are functionnally equivalent. Fixes: dd291d77cc90 ("block: Introduce zone write plugging") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Acked-by: Mike Snitzer Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20241209122357.47838-4-dlemoal@kernel.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 08a727b40816..4dd698dad2d6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1421,6 +1421,9 @@ static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector) return is_seq; } +int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask); + static inline unsigned int queue_dma_alignment(const struct request_queue *q) { return q->limits.dma_alignment; -- cgit v1.2.3 From fe0418eb9bd69a19a948b297c8de815e05f3cde1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 9 Dec 2024 21:23:57 +0900 Subject: block: Prevent potential deadlocks in zone write plug error recovery Zone write plugging for handling writes to zones of a zoned block device always execute a zone report whenever a write BIO to a zone fails. The intent of this is to ensure that the tracking of a zone write pointer is always correct to ensure that the alignment to a zone write pointer of write BIOs can be checked on submission and that we can always correctly emulate zone append operations using regular write BIOs. However, this error recovery scheme introduces a potential deadlock if a device queue freeze is initiated while BIOs are still plugged in a zone write plug and one of these write operation fails. In such case, the disk zone write plug error recovery work is scheduled and executes a report zone. This in turn can result in a request allocation in the underlying driver to issue the report zones command to the device. But with the device queue freeze already started, this allocation will block, preventing the report zone execution and the continuation of the processing of the plugged BIOs. As plugged BIOs hold a queue usage reference, the queue freeze itself will never complete, resulting in a deadlock. Avoid this problem by completely removing from the zone write plugging code the use of report zones operations after a failed write operation, instead relying on the device user to either execute a report zones, reset the zone, finish the zone, or give up writing to the device (which is a fairly common pattern for file systems which degrade to read-only after write failures). This is not an unreasonnable requirement as all well-behaved applications, FSes and device mapper already use report zones to recover from write errors whenever possible by comparing the current position of a zone write pointer with what their assumption about the position is. The changes to remove the automatic error recovery are as follows: - Completely remove the error recovery work and its associated resources (zone write plug list head, disk error list, and disk zone_wplugs_work work struct). This also removes the functions disk_zone_wplug_set_error() and disk_zone_wplug_clear_error(). - Change the BLK_ZONE_WPLUG_ERROR zone write plug flag into BLK_ZONE_WPLUG_NEED_WP_UPDATE. This new flag is set for a zone write plug whenever a write opration targetting the zone of the zone write plug fails. This flag indicates that the zone write pointer offset is not reliable and that it must be updated when the next report zone, reset zone, finish zone or disk revalidation is executed. - Modify blk_zone_write_plug_bio_endio() to set the BLK_ZONE_WPLUG_NEED_WP_UPDATE flag for the target zone of a failed write BIO. - Modify the function disk_zone_wplug_set_wp_offset() to clear this new flag, thus implementing recovery of a correct write pointer offset with the reset (all) zone and finish zone operations. - Modify blkdev_report_zones() to always use the disk_report_zones_cb() callback so that disk_zone_wplug_sync_wp_offset() can be called for any zone marked with the BLK_ZONE_WPLUG_NEED_WP_UPDATE flag. This implements recovery of a correct write pointer offset for zone write plugs marked with BLK_ZONE_WPLUG_NEED_WP_UPDATE and within the range of the report zones operation executed by the user. - Modify blk_revalidate_seq_zone() to call disk_zone_wplug_sync_wp_offset() for all sequential write required zones when a zoned block device is revalidated, thus always resolving any inconsistency between the write pointer offset of zone write plugs and the actual write pointer position of sequential zones. Fixes: dd291d77cc90 ("block: Introduce zone write plugging") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20241209122357.47838-5-dlemoal@kernel.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4dd698dad2d6..378d3a1a22fc 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -200,8 +200,6 @@ struct gendisk { spinlock_t zone_wplugs_lock; struct mempool_s *zone_wplugs_pool; struct hlist_head *zone_wplugs_hash; - struct list_head zone_wplugs_err_list; - struct work_struct zone_wplugs_work; struct workqueue_struct *zone_wplugs_wq; #endif /* CONFIG_BLK_DEV_ZONED */ -- cgit v1.2.3 From b238e187b4a2d3b54d80aec05a9cab6466b79dde Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 9 Dec 2024 20:10:54 -0800 Subject: bpf: refactor bpf_helper_changes_pkt_data to use helper number Use BPF helper number instead of function pointer in bpf_helper_changes_pkt_data(). This would simplify usage of this function in verifier.c:check_cfg() (in a follow-up patch), where only helper number is easily available and there is no real need to lookup helper proto. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20241210041100.1898468-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 3a21947f2fd4..0477254bc2d3 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1122,7 +1122,7 @@ bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena); bool bpf_jit_supports_private_stack(void); u64 bpf_arch_uaddress_limit(void); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); -bool bpf_helper_changes_pkt_data(void *func); +bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id); static inline bool bpf_dump_raw_ok(const struct cred *cred) { -- cgit v1.2.3 From 51081a3f25c742da5a659d7fc6fd77ebfdd555be Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 9 Dec 2024 20:10:55 -0800 Subject: bpf: track changes_pkt_data property for global functions When processing calls to certain helpers, verifier invalidates all packet pointers in a current state. For example, consider the following program: __attribute__((__noinline__)) long skb_pull_data(struct __sk_buff *sk, __u32 len) { return bpf_skb_pull_data(sk, len); } SEC("tc") int test_invalidate_checks(struct __sk_buff *sk) { int *p = (void *)(long)sk->data; if ((void *)(p + 1) > (void *)(long)sk->data_end) return TCX_DROP; skb_pull_data(sk, 0); *p = 42; return TCX_PASS; } After a call to bpf_skb_pull_data() the pointer 'p' can't be used safely. See function filter.c:bpf_helper_changes_pkt_data() for a list of such helpers. At the moment verifier invalidates packet pointers when processing helper function calls, and does not traverse global sub-programs when processing calls to global sub-programs. This means that calls to helpers done from global sub-programs do not invalidate pointers in the caller state. E.g. the program above is unsafe, but is not rejected by verifier. This commit fixes the omission by computing field bpf_subprog_info->changes_pkt_data for each sub-program before main verification pass. changes_pkt_data should be set if: - subprogram calls helper for which bpf_helper_changes_pkt_data returns true; - subprogram calls a global function, for which bpf_subprog_info->changes_pkt_data should be set. The verifier.c:check_cfg() pass is modified to compute this information. The commit relies on depth first instruction traversal done by check_cfg() and absence of recursive function calls: - check_cfg() would eventually visit every call to subprogram S in a state when S is fully explored; - when S is fully explored: - every direct helper call within S is explored (and thus changes_pkt_data is set if needed); - every call to subprogram S1 called by S was visited with S1 fully explored (and thus S inherits changes_pkt_data from S1). The downside of such approach is that dead code elimination is not taken into account: if a helper call inside global function is dead because of current configuration, verifier would conservatively assume that the call occurs for the purpose of the changes_pkt_data computation. Reported-by: Nick Zavaritsky Closes: https://lore.kernel.org/bpf/0498CA22-5779-4767-9C0C-A9515CEA711F@gmail.com/ Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20241210041100.1898468-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index f4290c179bee..48b7b2eeb7e2 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -659,6 +659,7 @@ struct bpf_subprog_info { bool args_cached: 1; /* true if bpf_fastcall stack region is used by functions that can't be inlined */ bool keep_fastcall_stack: 1; + bool changes_pkt_data: 1; enum priv_stack_mode priv_stack_mode; u8 arg_cnt; -- cgit v1.2.3 From 81f6d0530ba031b5f038a091619bf2ff29568852 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 9 Dec 2024 20:10:57 -0800 Subject: bpf: check changes_pkt_data property for extension programs When processing calls to global sub-programs, verifier decides whether to invalidate all packet pointers in current state depending on the changes_pkt_data property of the global sub-program. Because of this, an extension program replacing a global sub-program must be compatible with changes_pkt_data property of the sub-program being replaced. This commit: - adds changes_pkt_data flag to struct bpf_prog_aux: - this flag is set in check_cfg() for main sub-program; - in jit_subprogs() for other sub-programs; - modifies bpf_check_attach_btf_id() to check changes_pkt_data flag; - moves call to check_attach_btf_id() after the call to check_cfg(), because it needs changes_pkt_data flag to be set: bpf_check: ... ... - check_attach_btf_id resolve_pseudo_ldimm64 resolve_pseudo_ldimm64 --> bpf_prog_is_offloaded bpf_prog_is_offloaded check_cfg check_cfg + check_attach_btf_id ... ... The following fields are set by check_attach_btf_id(): - env->ops - prog->aux->attach_btf_trace - prog->aux->attach_func_name - prog->aux->attach_func_proto - prog->aux->dst_trampoline - prog->aux->mod - prog->aux->saved_dst_attach_type - prog->aux->saved_dst_prog_type - prog->expected_attach_type Neither of these fields are used by resolve_pseudo_ldimm64() or bpf_prog_offload_verifier_prep() (for netronome and netdevsim drivers), so the reordering is safe. Suggested-by: Alexei Starovoitov Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20241210041100.1898468-6-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eaee2a819f4c..fe392d074973 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1527,6 +1527,7 @@ struct bpf_prog_aux { bool is_extended; /* true if extended by freplace program */ bool jits_use_priv_stack; bool priv_stack_requested; + bool changes_pkt_data; u64 prog_array_member_cnt; /* counts how many times as member of prog_array */ struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */ struct bpf_arena *arena; -- cgit v1.2.3 From 7d0d673627e20cfa3b21a829a896ce03b58a4f1c Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 10 Dec 2024 20:08:14 +0100 Subject: bpf: Fix theoretical prog_array UAF in __uprobe_perf_func() Currently, the pointer stored in call->prog_array is loaded in __uprobe_perf_func(), with no RCU annotation and no immediately visible RCU protection, so it looks as if the loaded pointer can immediately be dangling. Later, bpf_prog_run_array_uprobe() starts a RCU-trace read-side critical section, but this is too late. It then uses rcu_dereference_check(), but this use of rcu_dereference_check() does not actually dereference anything. Fix it by aligning the semantics to bpf_prog_run_array(): Let the caller provide rcu_read_lock_trace() protection and then load call->prog_array with rcu_dereference_check(). This issue seems to be theoretical: I don't know of any way to reach this code without having handle_swbp() further up the stack, which is already holding a rcu_read_lock_trace() lock, so where we take rcu_read_lock_trace() in __uprobe_perf_func()/bpf_prog_run_array_uprobe() doesn't actually have any effect. Fixes: 8c7dcb84e3b7 ("bpf: implement sleepable uprobes by chaining gps") Suggested-by: Andrii Nakryiko Signed-off-by: Jann Horn Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241210-bpf-fix-uprobe-uaf-v4-1-5fc8959b2b74@google.com --- include/linux/bpf.h | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fe392d074973..805040813f5d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2194,26 +2194,25 @@ bpf_prog_run_array(const struct bpf_prog_array *array, * rcu-protected dynamically sized maps. */ static __always_inline u32 -bpf_prog_run_array_uprobe(const struct bpf_prog_array __rcu *array_rcu, +bpf_prog_run_array_uprobe(const struct bpf_prog_array *array, const void *ctx, bpf_prog_run_fn run_prog) { const struct bpf_prog_array_item *item; const struct bpf_prog *prog; - const struct bpf_prog_array *array; struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; u32 ret = 1; might_fault(); + RCU_LOCKDEP_WARN(!rcu_read_lock_trace_held(), "no rcu lock held"); + + if (unlikely(!array)) + return ret; - rcu_read_lock_trace(); migrate_disable(); run_ctx.is_uprobe = true; - array = rcu_dereference_check(array_rcu, rcu_read_lock_trace_held()); - if (unlikely(!array)) - goto out; old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); item = &array->items[0]; while ((prog = READ_ONCE(item->prog))) { @@ -2228,9 +2227,7 @@ bpf_prog_run_array_uprobe(const struct bpf_prog_array __rcu *array_rcu, rcu_read_unlock(); } bpf_reset_run_ctx(old_run_ctx); -out: migrate_enable(); - rcu_read_unlock_trace(); return ret; } -- cgit v1.2.3 From d2516c3a53705f783bb6868df0f4a2b977898a71 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 10 Dec 2024 15:12:41 +0100 Subject: net, team, bonding: Add netdev_base_features helper Both bonding and team driver have logic to derive the base feature flags before iterating over their slave devices to refine the set via netdev_increment_features(). Add a small helper netdev_base_features() so this can be reused instead of having it open-coded multiple times. Signed-off-by: Daniel Borkmann Cc: Nikolay Aleksandrov Cc: Ido Schimmel Cc: Jiri Pirko Reviewed-by: Hangbin Liu Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20241210141245.327886-1-daniel@iogearbox.net Signed-off-by: Paolo Abeni --- include/linux/netdev_features.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 66e7d26b70a4..11be70a7929f 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -253,4 +253,11 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start) NETIF_F_GSO_UDP_TUNNEL | \ NETIF_F_GSO_UDP_TUNNEL_CSUM) +static inline netdev_features_t netdev_base_features(netdev_features_t features) +{ + features &= ~NETIF_F_ONE_FOR_ALL; + features |= NETIF_F_ALL_FOR_ALL; + return features; +} + #endif /* _LINUX_NETDEV_FEATURES_H */ -- cgit v1.2.3 From 2f4873f9b5f8a49113045ad91c021347486de323 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 2 Dec 2024 11:57:27 +0000 Subject: block: Make bio_iov_bvec_set() accept pointer to const iov_iter Make bio_iov_bvec_set() accept a pointer to const iov_iter, which means that we can drop the undesirable casting to struct iov_iter pointer in blk_rq_map_user_bvec(). Signed-off-by: John Garry Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241202115727.2320401-1-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 60830a6a5939..7a1b3b1a8fed 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -423,7 +423,7 @@ void __bio_add_page(struct bio *bio, struct page *page, void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, size_t off); int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); -void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter); +void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter); void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -- cgit v1.2.3 From 0ef8047b737d7480a5d4c46d956e97c190f13050 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 29 Nov 2024 16:15:54 +0100 Subject: x86/static-call: provide a way to do very early static-call updates Add static_call_update_early() for updating static-call targets in very early boot. This will be needed for support of Xen guest type specific hypercall functions. This is part of XSA-466 / CVE-2024-53241. Reported-by: Andrew Cooper Signed-off-by: Juergen Gross Co-developed-by: Peter Zijlstra Co-developed-by: Josh Poimboeuf --- include/linux/compiler.h | 37 ++++++++++++++++++++++++++----------- include/linux/static_call.h | 1 + 2 files changed, 27 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 469a64dd6495..240c632c5b95 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -216,28 +216,43 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #endif /* __KERNEL__ */ +/** + * offset_to_ptr - convert a relative memory offset to an absolute pointer + * @off: the address of the 32-bit offset value + */ +static inline void *offset_to_ptr(const int *off) +{ + return (void *)((unsigned long)off + *off); +} + +#endif /* __ASSEMBLY__ */ + +#ifdef CONFIG_64BIT +#define ARCH_SEL(a,b) a +#else +#define ARCH_SEL(a,b) b +#endif + /* * Force the compiler to emit 'sym' as a symbol, so that we can reference * it from inline assembler. Necessary in case 'sym' could be inlined * otherwise, or eliminated entirely due to lack of references that are * visible to the compiler. */ -#define ___ADDRESSABLE(sym, __attrs) \ - static void * __used __attrs \ +#define ___ADDRESSABLE(sym, __attrs) \ + static void * __used __attrs \ __UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)(uintptr_t)&sym; + #define __ADDRESSABLE(sym) \ ___ADDRESSABLE(sym, __section(".discard.addressable")) -/** - * offset_to_ptr - convert a relative memory offset to an absolute pointer - * @off: the address of the 32-bit offset value - */ -static inline void *offset_to_ptr(const int *off) -{ - return (void *)((unsigned long)off + *off); -} +#define __ADDRESSABLE_ASM(sym) \ + .pushsection .discard.addressable,"aw"; \ + .align ARCH_SEL(8,4); \ + ARCH_SEL(.quad, .long) __stringify(sym); \ + .popsection; -#endif /* __ASSEMBLY__ */ +#define __ADDRESSABLE_ASM_STR(sym) __stringify(__ADDRESSABLE_ASM(sym)) #ifdef __CHECKER__ #define __BUILD_BUG_ON_ZERO_MSG(e, msg) (0) diff --git a/include/linux/static_call.h b/include/linux/static_call.h index 141e6b176a1b..785980af8972 100644 --- a/include/linux/static_call.h +++ b/include/linux/static_call.h @@ -138,6 +138,7 @@ #ifdef CONFIG_HAVE_STATIC_CALL #include +extern int static_call_initialized; /* * Either @site or @tramp can be NULL. */ -- cgit v1.2.3 From b53127db1dbf7f1047cf35c10922d801dcd40324 Mon Sep 17 00:00:00 2001 From: "Vineeth Pillai (Google)" Date: Thu, 12 Dec 2024 22:22:36 -0500 Subject: sched/dlserver: Fix dlserver double enqueue dlserver can get dequeued during a dlserver pick_task due to the delayed deueue feature and this can lead to issues with dlserver logic as it still thinks that dlserver is on the runqueue. The dlserver throttling and replenish logic gets confused and can lead to double enqueue of dlserver. Double enqueue of dlserver could happend due to couple of reasons: Case 1 ------ Delayed dequeue feature[1] can cause dlserver being stopped during a pick initiated by dlserver: __pick_next_task pick_task_dl -> server_pick_task pick_task_fair pick_next_entity (if (sched_delayed)) dequeue_entities dl_server_stop server_pick_task goes ahead with update_curr_dl_se without knowing that dlserver is dequeued and this confuses the logic and may lead to unintended enqueue while the server is stopped. Case 2 ------ A race condition between a task dequeue on one cpu and same task's enqueue on this cpu by a remote cpu while the lock is released causing dlserver double enqueue. One cpu would be in the schedule() and releasing RQ-lock: current->state = TASK_INTERRUPTIBLE(); schedule(); deactivate_task() dl_stop_server(); pick_next_task() pick_next_task_fair() sched_balance_newidle() rq_unlock(this_rq) at which point another CPU can take our RQ-lock and do: try_to_wake_up() ttwu_queue() rq_lock() ... activate_task() dl_server_start() --> first enqueue wakeup_preempt() := check_preempt_wakeup_fair() update_curr() update_curr_task() if (current->dl_server) dl_server_update() enqueue_dl_entity() --> second enqueue This bug was not apparent as the enqueue in dl_server_start doesn't usually happen because of the defer logic. But as a side effect of the first case(dequeue during dlserver pick), dl_throttled and dl_yield will be set and this causes the time accounting of dlserver to messup and then leading to a enqueue in dl_server_start. Have an explicit flag representing the status of dlserver to avoid the confusion. This is set in dl_server_start and reset in dlserver_stop. Fixes: 63ba8422f876 ("sched/deadline: Introduce deadline servers") Suggested-by: Peter Zijlstra Signed-off-by: "Vineeth Pillai (Google)" Signed-off-by: Peter Zijlstra (Intel) Tested-by: Marcel Ziswiler # ROCK 5B Link: https://lkml.kernel.org/r/20241213032244.877029-1-vineeth@bitbyteword.org --- include/linux/sched.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index d380bffee2ef..66b311fbd5d6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -656,6 +656,12 @@ struct sched_dl_entity { * @dl_defer_armed tells if the deferrable server is waiting * for the replenishment timer to activate it. * + * @dl_server_active tells if the dlserver is active(started). + * dlserver is started on first cfs enqueue on an idle runqueue + * and is stopped when a dequeue results in 0 cfs tasks on the + * runqueue. In other words, dlserver is active only when cpu's + * runqueue has atleast one cfs task. + * * @dl_defer_running tells if the deferrable server is actually * running, skipping the defer phase. */ @@ -664,6 +670,7 @@ struct sched_dl_entity { unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; unsigned int dl_server : 1; + unsigned int dl_server_active : 1; unsigned int dl_defer : 1; unsigned int dl_defer_armed : 1; unsigned int dl_defer_running : 1; -- cgit v1.2.3 From c00d738e1673ab801e1577e4e3c780ccf88b1a5b Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 13 Dec 2024 14:19:27 -0800 Subject: bpf: Revert "bpf: Mark raw_tp arguments with PTR_MAYBE_NULL" This patch reverts commit cb4158ce8ec8 ("bpf: Mark raw_tp arguments with PTR_MAYBE_NULL"). The patch was well-intended and meant to be as a stop-gap fixing branch prediction when the pointer may actually be NULL at runtime. Eventually, it was supposed to be replaced by an automated script or compiler pass detecting possibly NULL arguments and marking them accordingly. However, it caused two main issues observed for production programs and failed to preserve backwards compatibility. First, programs relied on the verifier not exploring == NULL branch when pointer is not NULL, thus they started failing with a 'dereference of scalar' error. Next, allowing raw_tp arguments to be modified surfaced the warning in the verifier that warns against reg->off when PTR_MAYBE_NULL is set. More information, context, and discusson on both problems is available in [0]. Overall, this approach had several shortcomings, and the fixes would further complicate the verifier's logic, and the entire masking scheme would have to be removed eventually anyway. Hence, revert the patch in preparation of a better fix avoiding these issues to replace this commit. [0]: https://lore.kernel.org/bpf/20241206161053.809580-1-memxor@gmail.com Reported-by: Manu Bretelle Fixes: cb4158ce8ec8 ("bpf: Mark raw_tp arguments with PTR_MAYBE_NULL") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241213221929.3495062-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 805040813f5d..6e63dd3443b9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3514,10 +3514,4 @@ static inline bool bpf_is_subprog(const struct bpf_prog *prog) return prog->aux->func_idx != 0; } -static inline bool bpf_prog_is_raw_tp(const struct bpf_prog *prog) -{ - return prog->type == BPF_PROG_TYPE_TRACING && - prog->expected_attach_type == BPF_TRACE_RAW_TP; -} - #endif /* _LINUX_BPF_H */ -- cgit v1.2.3 From 239d87327dcd361b0098038995f8908f3296864f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 12 Dec 2024 17:28:06 -0800 Subject: fortify: Hide run-time copy size from value range tracking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC performs value range tracking for variables as a way to provide better diagnostics. One place this is regularly seen is with warnings associated with bounds-checking, e.g. -Wstringop-overflow, -Wstringop-overread, -Warray-bounds, etc. In order to keep the signal-to-noise ratio high, warnings aren't emitted when a value range spans the entire value range representable by a given variable. For example: unsigned int len; char dst[8]; ... memcpy(dst, src, len); If len's value is unknown, it has the full "unsigned int" range of [0, UINT_MAX], and GCC's compile-time bounds checks against memcpy() will be ignored. However, when a code path has been able to narrow the range: if (len > 16) return; memcpy(dst, src, len); Then the range will be updated for the execution path. Above, len is now [0, 16] when reading memcpy(), so depending on other optimizations, we might see a -Wstringop-overflow warning like: error: '__builtin_memcpy' writing between 9 and 16 bytes into region of size 8 [-Werror=stringop-overflow] When building with CONFIG_FORTIFY_SOURCE, the fortified run-time bounds checking can appear to narrow value ranges of lengths for memcpy(), depending on how the compiler constructs the execution paths during optimization passes, due to the checks against the field sizes. For example: if (p_size_field != SIZE_MAX && p_size != p_size_field && p_size_field < size) As intentionally designed, these checks only affect the kernel warnings emitted at run-time and do not block the potentially overflowing memcpy(), so GCC thinks it needs to produce a warning about the resulting value range that might be reaching the memcpy(). We have seen this manifest a few times now, with the most recent being with cpumasks: In function ‘bitmap_copy’, inlined from ‘cpumask_copy’ at ./include/linux/cpumask.h:839:2, inlined from ‘__padata_set_cpumasks’ at kernel/padata.c:730:2: ./include/linux/fortify-string.h:114:33: error: ‘__builtin_memcpy’ reading between 257 and 536870904 bytes from a region of size 256 [-Werror=stringop-overread] 114 | #define __underlying_memcpy __builtin_memcpy | ^ ./include/linux/fortify-string.h:633:9: note: in expansion of macro ‘__underlying_memcpy’ 633 | __underlying_##op(p, q, __fortify_size); \ | ^~~~~~~~~~~~~ ./include/linux/fortify-string.h:678:26: note: in expansion of macro ‘__fortify_memcpy_chk’ 678 | #define memcpy(p, q, s) __fortify_memcpy_chk(p, q, s, \ | ^~~~~~~~~~~~~~~~~~~~ ./include/linux/bitmap.h:259:17: note: in expansion of macro ‘memcpy’ 259 | memcpy(dst, src, len); | ^~~~~~ kernel/padata.c: In function ‘__padata_set_cpumasks’: kernel/padata.c:713:48: note: source object ‘pcpumask’ of size [0, 256] 713 | cpumask_var_t pcpumask, | ~~~~~~~~~~~~~~^~~~~~~~ This warning is _not_ emitted when CONFIG_FORTIFY_SOURCE is disabled, and with the recent -fdiagnostics-details we can confirm the origin of the warning is due to FORTIFY's bounds checking: ../include/linux/bitmap.h:259:17: note: in expansion of macro 'memcpy' 259 | memcpy(dst, src, len); | ^~~~~~ '__padata_set_cpumasks': events 1-2 ../include/linux/fortify-string.h:613:36: 612 | if (p_size_field != SIZE_MAX && | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 613 | p_size != p_size_field && p_size_field < size) | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~ | | | (1) when the condition is evaluated to false | (2) when the condition is evaluated to true '__padata_set_cpumasks': event 3 114 | #define __underlying_memcpy __builtin_memcpy | ^ | | | (3) out of array bounds here Note that the cpumask warning started appearing since bitmap functions were recently marked __always_inline in commit ed8cd2b3bd9f ("bitmap: Switch from inline to __always_inline"), which allowed GCC to gain visibility into the variables as they passed through the FORTIFY implementation. In order to silence these false positives but keep otherwise deterministic compile-time warnings intact, hide the length variable from GCC with OPTIMIZE_HIDE_VAR() before calling the builtin memcpy. Additionally add a comment about why all the macro args have copies with const storage. Reported-by: "Thomas Weißschuh" Closes: https://lore.kernel.org/all/db7190c8-d17f-4a0d-bc2f-5903c79f36c2@t-8ch.de/ Reported-by: Nilay Shroff Closes: https://lore.kernel.org/all/20241112124127.1666300-1-nilay@linux.ibm.com/ Tested-by: Nilay Shroff Acked-by: Yury Norov Acked-by: Greg Kroah-Hartman Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 0d99bf11d260..e4ce1cae03bf 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -616,6 +616,12 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, return false; } +/* + * To work around what seems to be an optimizer bug, the macro arguments + * need to have const copies or the values end up changed by the time they + * reach fortify_warn_once(). See commit 6f7630b1b5bc ("fortify: Capture + * __bos() results in const temp vars") for more details. + */ #define __fortify_memcpy_chk(p, q, size, p_size, q_size, \ p_size_field, q_size_field, op) ({ \ const size_t __fortify_size = (size_t)(size); \ @@ -623,6 +629,8 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, const size_t __q_size = (q_size); \ const size_t __p_size_field = (p_size_field); \ const size_t __q_size_field = (q_size_field); \ + /* Keep a mutable version of the size for the final copy. */ \ + size_t __copy_size = __fortify_size; \ fortify_warn_once(fortify_memcpy_chk(__fortify_size, __p_size, \ __q_size, __p_size_field, \ __q_size_field, FORTIFY_FUNC_ ##op), \ @@ -630,7 +638,11 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, __fortify_size, \ "field \"" #p "\" at " FILE_LINE, \ __p_size_field); \ - __underlying_##op(p, q, __fortify_size); \ + /* Hide only the run-time size from value range tracking to */ \ + /* silence compile-time false positive bounds warnings. */ \ + if (!__builtin_constant_p(__copy_size)) \ + OPTIMIZER_HIDE_VAR(__copy_size); \ + __underlying_##op(p, q, __copy_size); \ }) /* -- cgit v1.2.3 From afd2627f727b89496d79a6b934a025fc916d4ded Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Dec 2024 21:41:22 -0500 Subject: tracing: Check "%s" dereference via the field and not the TP_printk format The TP_printk() portion of a trace event is executed at the time a event is read from the trace. This can happen seconds, minutes, hours, days, months, years possibly later since the event was recorded. If the print format contains a dereference to a string via "%s", and that string was allocated, there's a chance that string could be freed before it is read by the trace file. To protect against such bugs, there are two functions that verify the event. The first one is test_event_printk(), which is called when the event is created. It reads the TP_printk() format as well as its arguments to make sure nothing may be dereferencing a pointer that was not copied into the ring buffer along with the event. If it is, it will trigger a WARN_ON(). For strings that use "%s", it is not so easy. The string may not reside in the ring buffer but may still be valid. Strings that are static and part of the kernel proper which will not be freed for the life of the running system, are safe to dereference. But to know if it is a pointer to a static string or to something on the heap can not be determined until the event is triggered. This brings us to the second function that tests for the bad dereferencing of strings, trace_check_vprintf(). It would walk through the printf format looking for "%s", and when it finds it, it would validate that the pointer is safe to read. If not, it would produces a WARN_ON() as well and write into the ring buffer "[UNSAFE-MEMORY]". The problem with this is how it used va_list to have vsnprintf() handle all the cases that it didn't need to check. Instead of re-implementing vsnprintf(), it would make a copy of the format up to the %s part, and call vsnprintf() with the current va_list ap variable, where the ap would then be ready to point at the string in question. For architectures that passed va_list by reference this was possible. For architectures that passed it by copy it was not. A test_can_verify() function was used to differentiate between the two, and if it wasn't possible, it would disable it. Even for architectures where this was feasible, it was a stretch to rely on such a method that is undocumented, and could cause issues later on with new optimizations of the compiler. Instead, the first function test_event_printk() was updated to look at "%s" as well. If the "%s" argument is a pointer outside the event in the ring buffer, it would find the field type of the event that is the problem and mark the structure with a new flag called "needs_test". The event itself will be marked by TRACE_EVENT_FL_TEST_STR to let it be known that this event has a field that needs to be verified before the event can be printed using the printf format. When the event fields are created from the field type structure, the fields would copy the field type's "needs_test" value. Finally, before being printed, a new function ignore_event() is called which will check if the event has the TEST_STR flag set (if not, it returns false). If the flag is set, it then iterates through the events fields looking for the ones that have the "needs_test" flag set. Then it uses the offset field from the field structure to find the pointer in the ring buffer event. It runs the tests to make sure that pointer is safe to print and if not, it triggers the WARN_ON() and also adds to the trace output that the event in question has an unsafe memory access. The ignore_event() makes the trace_check_vprintf() obsolete so it is removed. Link: https://lore.kernel.org/all/CAHk-=wh3uOnqnZPpR0PeLZZtyWbZLboZ7cHLCKRWsocvs9Y7hQ@mail.gmail.com/ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Al Viro Cc: Linus Torvalds Link: https://lore.kernel.org/20241217024720.848621576@goodmis.org Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers") Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 2a5df5b62cfc..91b8ffbdfa8c 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -273,7 +273,8 @@ struct trace_event_fields { const char *name; const int size; const int align; - const int is_signed; + const unsigned int is_signed:1; + unsigned int needs_test:1; const int filter_type; const int len; }; @@ -324,6 +325,7 @@ enum { TRACE_EVENT_FL_EPROBE_BIT, TRACE_EVENT_FL_FPROBE_BIT, TRACE_EVENT_FL_CUSTOM_BIT, + TRACE_EVENT_FL_TEST_STR_BIT, }; /* @@ -340,6 +342,7 @@ enum { * CUSTOM - Event is a custom event (to be attached to an exsiting tracepoint) * This is set when the custom event has not been attached * to a tracepoint yet, then it is cleared when it is. + * TEST_STR - The event has a "%s" that points to a string outside the event */ enum { TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), @@ -352,6 +355,7 @@ enum { TRACE_EVENT_FL_EPROBE = (1 << TRACE_EVENT_FL_EPROBE_BIT), TRACE_EVENT_FL_FPROBE = (1 << TRACE_EVENT_FL_FPROBE_BIT), TRACE_EVENT_FL_CUSTOM = (1 << TRACE_EVENT_FL_CUSTOM_BIT), + TRACE_EVENT_FL_TEST_STR = (1 << TRACE_EVENT_FL_TEST_STR_BIT), }; #define TRACE_EVENT_FL_UKPROBE (TRACE_EVENT_FL_KPROBE | TRACE_EVENT_FL_UPROBE) -- cgit v1.2.3 From 349f0086ba8b2a169877d21ff15a4d9da3a60054 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 18 Dec 2024 09:02:28 +0100 Subject: x86/static-call: fix 32-bit build In 32-bit x86 builds CONFIG_STATIC_CALL_INLINE isn't set, leading to static_call_initialized not being available. Define it as "0" in that case. Reported-by: Stephen Rothwell Fixes: 0ef8047b737d ("x86/static-call: provide a way to do very early static-call updates") Signed-off-by: Juergen Gross Acked-by: Peter Zijlstra (Intel) Signed-off-by: Linus Torvalds --- include/linux/static_call.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/static_call.h b/include/linux/static_call.h index 785980af8972..78a77a4ae0ea 100644 --- a/include/linux/static_call.h +++ b/include/linux/static_call.h @@ -138,7 +138,6 @@ #ifdef CONFIG_HAVE_STATIC_CALL #include -extern int static_call_initialized; /* * Either @site or @tramp can be NULL. */ @@ -161,6 +160,8 @@ extern void arch_static_call_transform(void *site, void *tramp, void *func, bool #ifdef CONFIG_HAVE_STATIC_CALL_INLINE +extern int static_call_initialized; + extern int __init static_call_init(void); extern void static_call_force_reinit(void); @@ -226,6 +227,8 @@ extern long __static_call_return0(void); #elif defined(CONFIG_HAVE_STATIC_CALL) +#define static_call_initialized 0 + static inline int static_call_init(void) { return 0; } #define DEFINE_STATIC_CALL(name, _func) \ @@ -282,6 +285,8 @@ extern long __static_call_return0(void); #else /* Generic implementation */ +#define static_call_initialized 0 + static inline int static_call_init(void) { return 0; } static inline long __static_call_return0(void) -- cgit v1.2.3