From ed2f752e0e0a21d941ca0ee539ef3d4cd576bc5e Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 20 Oct 2023 18:19:20 +0200 Subject: x86/percpu: Introduce const-qualified const_pcpu_hot to micro-optimize code generation Some variables in pcpu_hot, currently current_task and top_of_stack are actually per-thread variables implemented as per-CPU variables and thus stable for the duration of the respective task. There is already an attempt to eliminate redundant reads from these variables using this_cpu_read_stable() asm macro, which hides the dependency on the read memory address. However, the compiler has limited ability to eliminate asm common subexpressions, so this approach results in a limited success. The solution is to allow more aggressive elimination by aliasing pcpu_hot into a const-qualified const_pcpu_hot, and to read stable per-CPU variables from this constant copy. The current per-CPU infrastructure does not support reads from const-qualified variables. However, when the compiler supports segment qualifiers, it is possible to declare the const-aliased variable in the relevant named address space. The compiler considers access to the variable, declared in this way, as a read from a constant location, and will optimize reads from the variable accordingly. By implementing constant-qualified const_pcpu_hot, the compiler can eliminate redundant reads from the constant variables, reducing the number of loads from current_task from 3766 to 3217 on a test build, a -14.6% reduction. The reduction of loads translates to the following code savings: text data bss dec hex filename 25,477,353 4389456 808452 30675261 1d4113d vmlinux-old.o 25,476,074 4389440 808452 30673966 1d40c2e vmlinux-new.o representing a code size reduction of -1279 bytes. [ mingo: Updated the changelog, EXPORT(const_pcpu_hot). 
] Co-developed-by: Nadav Amit Signed-off-by: Nadav Amit Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231020162004.135244-1-ubizjak@gmail.com --- include/linux/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index d7779a18b24f..bf9815eaf4aa 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -212,7 +212,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, */ #define ___ADDRESSABLE(sym, __attrs) \ static void * __used __attrs \ - __UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)&sym; + __UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)(uintptr_t)&sym; #define __ADDRESSABLE(sym) \ ___ADDRESSABLE(sym, __section(".discard.addressable")) -- cgit v1.2.3 From 92697139b01339b6c0767fa1305a4df9a7c1f37f Mon Sep 17 00:00:00 2001 From: Guanjun Date: Mon, 27 Nov 2023 16:31:27 +0800 Subject: lib/find_bit: Fix the code comments about find_next_bit_wrap The function find_next_bit_wrap only has one memory region to search on. Adjust the comments. 
Signed-off-by: Guanjun Signed-off-by: Yury Norov --- include/linux/find.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/find.h b/include/linux/find.h index 5e4f39ef2e72..af63ae5b9013 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -413,8 +413,8 @@ unsigned long find_next_and_bit_wrap(const unsigned long *addr1, } /** - * find_next_bit_wrap - find the next set bit in both memory regions - * @addr: The first address to base the search on + * find_next_bit_wrap - find the next set bit in a memory region + * @addr: The address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * -- cgit v1.2.3 From 27c82f14e6d2bcb9f085bad37fe339227571de60 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 28 Oct 2023 12:05:29 -0700 Subject: lib/find: optimize find_*_bit_wrap When an offset is 0, there's no need to search a bitmap from the beginning after the 1st search failed, because each bit has already been tested. 
Signed-off-by: Yury Norov --- include/linux/find.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/find.h b/include/linux/find.h index af63ae5b9013..c69598e383c1 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -405,7 +405,7 @@ unsigned long find_next_and_bit_wrap(const unsigned long *addr1, { unsigned long bit = find_next_and_bit(addr1, addr2, size, offset); - if (bit < size) + if (bit < size || offset == 0) return bit; bit = find_first_and_bit(addr1, addr2, offset); @@ -427,7 +427,7 @@ unsigned long find_next_bit_wrap(const unsigned long *addr, { unsigned long bit = find_next_bit(addr, size, offset); - if (bit < size) + if (bit < size || offset == 0) return bit; bit = find_first_bit(addr, offset); -- cgit v1.2.3 From ea4654e0885348f0faa47f6d7b44a08d75ad16e9 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 21 Nov 2023 08:07:31 -0800 Subject: x86/bugs: Rename CONFIG_PAGE_TABLE_ISOLATION => CONFIG_MITIGATION_PAGE_TABLE_ISOLATION Step 4/10 of the namespace unification of CPU mitigations related Kconfig options. [ mingo: Converted new uses that got added since the series was posted. 
] Suggested-by: Josh Poimboeuf Signed-off-by: Breno Leitao Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Cc: Linus Torvalds Link: https://lore.kernel.org/r/20231121160740.1249350-5-leitao@debian.org --- include/linux/pti.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pti.h b/include/linux/pti.h index 1a941efcaa62..1fbf9d6c20ef 100644 --- a/include/linux/pti.h +++ b/include/linux/pti.h @@ -2,7 +2,7 @@ #ifndef _INCLUDE_PTI_H #define _INCLUDE_PTI_H -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION #include #else static inline void pti_init(void) { } -- cgit v1.2.3 From aefb2f2e619b6c334bcb31de830aa00ba0b11129 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 21 Nov 2023 08:07:32 -0800 Subject: x86/bugs: Rename CONFIG_RETPOLINE => CONFIG_MITIGATION_RETPOLINE Step 5/10 of the namespace unification of CPU mitigations related Kconfig options. [ mingo: Converted a few more uses in comments/messages as well. 
] Suggested-by: Josh Poimboeuf Signed-off-by: Breno Leitao Signed-off-by: Ingo Molnar Reviewed-by: Ariel Miculas Acked-by: Josh Poimboeuf Cc: Linus Torvalds Link: https://lore.kernel.org/r/20231121160740.1249350-6-leitao@debian.org --- include/linux/compiler-gcc.h | 2 +- include/linux/indirect_call_wrapper.h | 2 +- include/linux/module.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 2ceba3fe4ec1..d24f29091f4b 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -35,7 +35,7 @@ (typeof(ptr)) (__ptr + (off)); \ }) -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE #define __noretpoline __attribute__((__indirect_branch__("keep"))) #endif diff --git a/include/linux/indirect_call_wrapper.h b/include/linux/indirect_call_wrapper.h index c1c76a70a6ce..fe050dab55a3 100644 --- a/include/linux/indirect_call_wrapper.h +++ b/include/linux/indirect_call_wrapper.h @@ -2,7 +2,7 @@ #ifndef _LINUX_INDIRECT_CALL_WRAPPER_H #define _LINUX_INDIRECT_CALL_WRAPPER_H -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE /* * INDIRECT_CALL_$NR - wrapper for indirect calls with $NR known builtin diff --git a/include/linux/module.h b/include/linux/module.h index 9cd0009bd050..087b369e8f17 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -885,7 +885,7 @@ static inline void module_bug_finalize(const Elf_Ehdr *hdr, static inline void module_bug_cleanup(struct module *mod) {} #endif /* CONFIG_GENERIC_BUG */ -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE extern bool retpoline_module_ok(bool has_retpoline); #else static inline bool retpoline_module_ok(bool has_retpoline) -- cgit v1.2.3 From ac61d43983a4fe8e3ee600eee44c40868c14340a Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 21 Nov 2023 08:07:34 -0800 Subject: x86/bugs: Rename CONFIG_CPU_UNRET_ENTRY => CONFIG_MITIGATION_UNRET_ENTRY Step 7/10 of the 
namespace unification of CPU mitigations related Kconfig options. Suggested-by: Josh Poimboeuf Signed-off-by: Breno Leitao Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Cc: Linus Torvalds Link: https://lore.kernel.org/r/20231121160740.1249350-8-leitao@debian.org --- include/linux/objtool.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 33212e93f4a6..d030671a4c49 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -131,7 +131,7 @@ */ .macro VALIDATE_UNRET_BEGIN #if defined(CONFIG_NOINSTR_VALIDATION) && \ - (defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO)) + (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO)) .Lhere_\@: .pushsection .discard.validate_unret .long .Lhere_\@ - . -- cgit v1.2.3 From a033eec9a06ce25388e71fa1e888792a718b9c17 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 21 Nov 2023 08:07:36 -0800 Subject: x86/bugs: Rename CONFIG_CPU_SRSO => CONFIG_MITIGATION_SRSO Step 9/10 of the namespace unification of CPU mitigations related Kconfig options. Suggested-by: Josh Poimboeuf Signed-off-by: Breno Leitao Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Cc: Linus Torvalds Link: https://lore.kernel.org/r/20231121160740.1249350-10-leitao@debian.org --- include/linux/objtool.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/objtool.h b/include/linux/objtool.h index d030671a4c49..b3b8d3dab52d 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -131,7 +131,7 @@ */ .macro VALIDATE_UNRET_BEGIN #if defined(CONFIG_NOINSTR_VALIDATION) && \ - (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO)) + (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)) .Lhere_\@: .pushsection .discard.validate_unret .long .Lhere_\@ - . 
-- cgit v1.2.3 From f0eb58dd08770a2e24bfc41db5ee3ff7c3a684ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Duje=20Mihanovi=C4=87?= Date: Tue, 16 Jan 2024 13:54:31 -0800 Subject: Input: navpoint - remove driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This driver does not use the SPI core as it should, instead tampering with the SSP registers manually. Refactoring the driver is almost certainly not worth it as the hardware seems to have been designed for and used only in the HP iPAQ hx4700 removed more than a year ago in d6df7df7ae5a ("ARM: pxa: remove unused board files"), so let's remove it. Signed-off-by: Duje Mihanović Link: https://lore.kernel.org/r/20240116-navpoint-removal-v2-2-e566806f1009@skole.hr Signed-off-by: Dmitry Torokhov --- include/linux/input/navpoint.h | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 include/linux/input/navpoint.h (limited to 'include/linux') diff --git a/include/linux/input/navpoint.h b/include/linux/input/navpoint.h deleted file mode 100644 index 5192ae3f5ec1..000000000000 --- a/include/linux/input/navpoint.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2012 Paul Parsons - */ - -struct navpoint_platform_data { - int port; /* PXA SSP port for pxa_ssp_request() */ -}; -- cgit v1.2.3 From 6df534cc7136fc9e023cbd4e0011a04e3659e74d Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 5 Jan 2024 11:32:50 +0100 Subject: spi: make spi_bus_type const Now that the driver core can properly handle constant struct bus_type, move the spi_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. 
Cc: Mark Brown Cc: Signed-off-by: Greg Kroah-Hartman Link: https://msgid.link/r/2024010549-erasure-swoop-1cc6@gregkh Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 471fe2ff9066..f306aececeaf 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -36,7 +36,7 @@ struct spi_message; * INTERFACES between SPI master-side drivers and SPI slave protocol handlers, * and SPI infrastructure. */ -extern struct bus_type spi_bus_type; +extern const struct bus_type spi_bus_type; /** * struct spi_statistics - statistics for spi transfers -- cgit v1.2.3 From 73fa7547c70b32cc69685f79be31135797734eb6 Mon Sep 17 00:00:00 2001 From: Rich Felker Date: Mon, 31 Aug 2020 11:32:08 -0400 Subject: vfs: add RWF_NOAPPEND flag for pwritev2 The pwrite function, originally defined by POSIX (thus the "p"), is defined to ignore O_APPEND and write at the offset passed as its argument. However, historically Linux honored O_APPEND if set and ignored the offset. This cannot be changed due to stability policy, but is documented in the man page as a bug. Now that there's a pwritev2 syscall providing a superset of the pwrite functionality that has a flags argument, the conforming behavior can be offered to userspace via a new flag. Since pwritev2 checks flag validity (in kiocb_set_rw_flags) and reports unknown ones with EOPNOTSUPP, callers will not get wrong behavior on old kernels that don't support the new flag; the error is reported and the caller can decide how to handle it. 
Signed-off-by: Rich Felker Link: https://lore.kernel.org/r/20200831153207.GO3265@brightrain.aerifal.cx Reviewed-by: Jann Horn Signed-off-by: Christian Brauner --- include/linux/fs.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a70495..4f7cfda29143 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3335,6 +3335,8 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) return 0; if (unlikely(flags & ~RWF_SUPPORTED)) return -EOPNOTSUPP; + if (unlikely((flags & RWF_APPEND) && (flags & RWF_NOAPPEND))) + return -EINVAL; if (flags & RWF_NOWAIT) { if (!(ki->ki_filp->f_mode & FMODE_NOWAIT)) @@ -3345,6 +3347,12 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; + if ((flags & RWF_NOAPPEND) && (ki->ki_flags & IOCB_APPEND)) { + if (IS_APPEND(file_inode(ki->ki_filp))) + return -EPERM; + ki->ki_flags &= ~IOCB_APPEND; + } + ki->ki_flags |= kiocb_flags; return 0; } -- cgit v1.2.3 From 12f7900c575679af411aaa89340bfe3dc68b46b3 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 19 Jan 2024 04:33:39 +0800 Subject: writeback: move wb_wakeup_delayed definition to fs-writeback.c The wb_wakeup_delayed is only used in fs-writeback.c. Move it to fs-writeback.c after the definition of wb_wakeup and make it static. 
Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20240118203339.764093-1-shikemeng@huaweicloud.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/backing-dev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 1a97277f99b1..8e7af9a03b41 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -38,7 +38,6 @@ struct backing_dev_info *bdi_alloc(int node_id); void wb_start_background_writeback(struct bdi_writeback *wb); void wb_workfn(struct work_struct *work); -void wb_wakeup_delayed(struct bdi_writeback *wb); void wb_wait_for_completion(struct wb_completion *done); -- cgit v1.2.3 From 8e98b87f515d8c4bae521048a037b2cc431c3fd5 Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Wed, 17 Jan 2024 14:10:49 +0100 Subject: iio: imu: adis: ensure proper DMA alignment Aligning the buffer to the L1 cache is not sufficient on some platforms as they might have larger cacheline sizes for caches after L1 and thus, we can't guarantee DMA safety. That was the whole reason to introduce IIO_DMA_MINALIGN in [1]. Do the same for the ADIS library. 
[1]: https://lore.kernel.org/linux-iio/20220508175712.647246-2-jic23@kernel.org/ Fixes: ccd2b52f4ac6 ("staging:iio: Add common ADIS library") Signed-off-by: Nuno Sa Link: https://lore.kernel.org/r/20240117-adis-improv-v1-1-7f90e9fad200@analog.com Cc: Signed-off-by: Jonathan Cameron --- include/linux/iio/imu/adis.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index dc9ea299e088..8898966bc0f0 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -11,6 +11,7 @@ #include #include +#include #include #define ADIS_WRITE_REG(reg) ((0x80 | (reg))) @@ -131,7 +132,7 @@ struct adis { unsigned long irq_flag; void *buffer; - u8 tx[10] ____cacheline_aligned; + u8 tx[10] __aligned(IIO_DMA_MINALIGN); u8 rx[4]; }; -- cgit v1.2.3 From 59598510be1d49e1cff7fd7593293bb8e1b2398b Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Wed, 17 Jan 2024 13:41:03 +0100 Subject: iio: adc: ad_sigma_delta: ensure proper DMA alignment Aligning the buffer to the L1 cache is not sufficient in some platforms as they might have larger cacheline sizes for caches after L1 and thus, we can't guarantee DMA safety. That was the whole reason to introduce IIO_DMA_MINALIGN in [1]. Do the same for the sigma_delta ADCs. 
[1]: https://lore.kernel.org/linux-iio/20220508175712.647246-2-jic23@kernel.org/ Fixes: 0fb6ee8d0b5e ("iio: ad_sigma_delta: Don't put SPI transfer buffer on the stack") Signed-off-by: Nuno Sa Link: https://lore.kernel.org/r/20240117-dev_sigma_delta_no_irq_flags-v1-1-db39261592cf@analog.com Cc: Signed-off-by: Jonathan Cameron --- include/linux/iio/adc/ad_sigma_delta.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h index 7852f6c9a714..719cf9cc6e1a 100644 --- a/include/linux/iio/adc/ad_sigma_delta.h +++ b/include/linux/iio/adc/ad_sigma_delta.h @@ -8,6 +8,8 @@ #ifndef __AD_SIGMA_DELTA_H__ #define __AD_SIGMA_DELTA_H__ +#include + enum ad_sigma_delta_mode { AD_SD_MODE_CONTINUOUS = 0, AD_SD_MODE_SINGLE = 1, @@ -99,7 +101,7 @@ struct ad_sigma_delta { * 'rx_buf' is up to 32 bits per sample + 64 bit timestamp, * rounded to 16 bytes to take into account padding. */ - uint8_t tx_buf[4] ____cacheline_aligned; + uint8_t tx_buf[4] __aligned(IIO_DMA_MINALIGN); uint8_t rx_buf[16] __aligned(8); }; -- cgit v1.2.3 From 28c5d4e40752fc39507a647b20649c5ca1cf33b7 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 10 Jan 2024 01:14:50 +0000 Subject: of: Add for_each_reserved_child_of_node() We would like to use for_each loop for status = "reserved" nodes. Add for_each_reserved_child_of_node() for it. 
Signed-off-by: Kuninori Morimoto Tested-by: Yusuke Goda Reviewed-by: Geert Uytterhoeven Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/87a5pegfau.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Geert Uytterhoeven --- include/linux/of.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 6a9ddf20e79a..331e05918f11 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -294,6 +294,8 @@ extern struct device_node *of_get_next_child(const struct device_node *node, struct device_node *prev); extern struct device_node *of_get_next_available_child( const struct device_node *node, struct device_node *prev); +extern struct device_node *of_get_next_reserved_child( + const struct device_node *node, struct device_node *prev); extern struct device_node *of_get_compatible_child(const struct device_node *parent, const char *compatible); @@ -541,6 +543,12 @@ static inline struct device_node *of_get_next_available_child( return NULL; } +static inline struct device_node *of_get_next_reserved_child( + const struct device_node *node, struct device_node *prev) +{ + return NULL; +} + static inline struct device_node *of_find_node_with_property( struct device_node *from, const char *prop_name) { @@ -1431,6 +1439,9 @@ static inline int of_property_read_s32(const struct device_node *np, #define for_each_available_child_of_node(parent, child) \ for (child = of_get_next_available_child(parent, NULL); child != NULL; \ child = of_get_next_available_child(parent, child)) +#define for_each_reserved_child_of_node(parent, child) \ + for (child = of_get_next_reserved_child(parent, NULL); child != NULL; \ + child = of_get_next_reserved_child(parent, child)) #define for_each_of_cpu_node(cpu) \ for (cpu = of_get_next_cpu_node(NULL); cpu != NULL; \ -- cgit v1.2.3 From db5914695a84a7b128ec2e4e9272e6e8091753e1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 22 Jan 2024 11:25:57 +0000 Subject: 
inet_diag: add module pointer to "struct inet_diag_handler" Following patch is going to use RCU instead of inet_diag_table_mutex acquisition. This patch is a preparation, no change of behavior yet. Signed-off-by: Eric Dumazet Reviewed-by: Guillaume Nault Reviewed-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Signed-off-by: Paolo Abeni --- include/linux/inet_diag.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 84abb30a3fbb..a9033696b0aa 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -8,6 +8,7 @@ struct inet_hashinfo; struct inet_diag_handler { + struct module *owner; void (*dump)(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r); -- cgit v1.2.3 From 114b4bb1cc19239b272d52ebbe156053483fe2f8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 22 Jan 2024 11:25:59 +0000 Subject: sock_diag: add module pointer to "struct sock_diag_handler" Following patch is going to use RCU instead of sock_diag_table_mutex acquisition. This patch is a preparation, no change of behavior yet. 
Signed-off-by: Eric Dumazet Reviewed-by: Guillaume Nault Reviewed-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Signed-off-by: Paolo Abeni --- include/linux/sock_diag.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h index 0b9ecd8cf979..7c07754d711b 100644 --- a/include/linux/sock_diag.h +++ b/include/linux/sock_diag.h @@ -13,6 +13,7 @@ struct nlmsghdr; struct sock; struct sock_diag_handler { + struct module *owner; __u8 family; int (*dump)(struct sk_buff *skb, struct nlmsghdr *nlh); int (*get_info)(struct sk_buff *skb, struct sock *sk); -- cgit v1.2.3 From 86e8921df05c6e9423ab74ab8d41022775d8b83a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 22 Jan 2024 11:26:01 +0000 Subject: sock_diag: allow concurrent operation in sock_diag_rcv_msg() TCPDIAG_GETSOCK and DCCPDIAG_GETSOCK diag are serialized on sock_diag_table_mutex. This is to make sure inet_diag module is not unloaded while diag was ongoing. It is time to get rid of this mutex and use RCU protection, allowing full parallelism. 
Signed-off-by: Eric Dumazet Reviewed-by: Guillaume Nault Reviewed-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Signed-off-by: Paolo Abeni --- include/linux/sock_diag.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h index 7c07754d711b..110978dc9af1 100644 --- a/include/linux/sock_diag.h +++ b/include/linux/sock_diag.h @@ -23,8 +23,13 @@ struct sock_diag_handler { int sock_diag_register(const struct sock_diag_handler *h); void sock_diag_unregister(const struct sock_diag_handler *h); -void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)); -void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)); +struct sock_diag_inet_compat { + struct module *owner; + int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh); +}; + +void sock_diag_register_inet_compat(const struct sock_diag_inet_compat *ptr); +void sock_diag_unregister_inet_compat(const struct sock_diag_inet_compat *ptr); u64 __sock_gen_cookie(struct sock *sk); -- cgit v1.2.3 From 42c3732fa8073717dd7d924472f1c0bc5b452fdc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 30 Dec 2023 19:46:00 -0500 Subject: fs: Create a generic is_dot_dotdot() utility De-duplicate the same functionality in several places by hoisting the is_dot_dotdot() utility function into linux/fs.h. 
Suggested-by: Amir Goldstein Reviewed-by: Jeff Layton Reviewed-by: Amir Goldstein Acked-by: Christian Brauner Signed-off-by: Chuck Lever --- include/linux/fs.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 98b7a7a8c42e..baa64344a308 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2846,6 +2846,17 @@ extern bool path_is_under(const struct path *, const struct path *); extern char *file_path(struct file *, char *, int); +/** + * is_dot_dotdot - returns true only if @name is "." or ".." + * @name: file name to check + * @len: length of file name, in bytes + */ +static inline bool is_dot_dotdot(const char *name, size_t len) +{ + return len && unlikely(name[0] == '.') && + (len == 1 || (len == 2 && name[1] == '.')); +} + #include /* needed for stackable file system support */ -- cgit v1.2.3 From 16c31dd7fdf6c7ec88370928d0baf9f45d13c5a6 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 10 Jan 2024 11:58:17 +0900 Subject: Compiler Attributes: counted_by: bump min gcc version GCC is expected to implement this feature in version 15, so bump the version. 
Signed-off-by: Sergey Senozhatsky Reviewed-by: Nathan Chancellor Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/e1c27b64ae7abe2ebe647be11b71cf1bca84f677.1704855495.git.senozhatsky@chromium.org Signed-off-by: Miguel Ojeda --- include/linux/compiler_attributes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index 28566624f008..215882a1341a 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -95,7 +95,7 @@ #endif /* - * Optional: only supported since gcc >= 14 + * Optional: only supported since gcc >= 15 * Optional: only supported since clang >= 18 * * gcc: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108896 -- cgit v1.2.3 From 2993eb7a8d34aee6165e1f6676e81cdf1d22aa62 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 10 Jan 2024 11:58:18 +0900 Subject: Compiler Attributes: counted_by: fixup clang URL The URL in question 404 now, fix it up (and switch to github). 
Signed-off-by: Sergey Senozhatsky Reviewed-by: Kees Cook Reviewed-by: Nathan Chancellor Link: https://lore.kernel.org/r/b7babeb9c5b14af9189f0d6225673e6e9a8f4ad3.1704855496.git.senozhatsky@chromium.org Signed-off-by: Miguel Ojeda --- include/linux/compiler_attributes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index 215882a1341a..289810685fc5 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -99,7 +99,7 @@ * Optional: only supported since clang >= 18 * * gcc: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108896 - * clang: https://reviews.llvm.org/D148381 + * clang: https://github.com/llvm/llvm-project/pull/76348 */ #if __has_attribute(__counted_by__) # define __counted_by(member) __attribute__((__counted_by__(member))) -- cgit v1.2.3 From 7c05e7f3e74e7e550534d524e04d7e6f78d6fa24 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 5 Jan 2024 18:48:17 +0800 Subject: bpf: Support inlining bpf_kptr_xchg() helper The motivation for inlining bpf_kptr_xchg() comes from the performance profiling of the bpf memory allocator benchmark. The benchmark uses bpf_kptr_xchg() to stash the allocated objects and to pop the stashed objects for free. After inlining bpf_kptr_xchg(), the performance for object free on an 8-CPU VM increases by about 2%~10%. Inlining also has a downside: both the kasan and kcsan checks on the pointer will be unavailable. bpf_kptr_xchg() can be inlined by converting the call to bpf_kptr_xchg() into an atomic_xchg() instruction. But the conversion depends on two conditions: 1) JIT backend supports atomic_xchg() on pointer-sized word 2) For the specific arch, the implementation of xchg is the same as atomic_xchg() on pointer-sized words. It seems most 64-bit JIT backends satisfy these two conditions. 
But as a precaution, define a weak function bpf_jit_supports_ptr_xchg() to state whether such a conversion is safe, and only support inlining on 64-bit hosts. x86-64 supports the BPF_XCHG atomic operation, and both xchg() and atomic_xchg() use arch_xchg() to implement the exchange, so enable the inlining of bpf_kptr_xchg() on x86-64 first. Reviewed-by: Eduard Zingerman Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20240105104819.3916743-2-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 68fb6c8142fe..35f067fd3840 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -955,6 +955,7 @@ bool bpf_jit_supports_subprog_tailcalls(void); bool bpf_jit_supports_kfunc_call(void); bool bpf_jit_supports_far_kfunc_call(void); bool bpf_jit_supports_exceptions(void); +bool bpf_jit_supports_ptr_xchg(void); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); bool bpf_helper_changes_pkt_data(void *func); -- cgit v1.2.3 From 522bb2c1f82b12eb7befaae815d1d959b8e6bba2 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 4 Jan 2024 16:09:05 -0800 Subject: bpf: support multiple tags per argument Add the ability to iterate over multiple decl_tag types pointing to the same function argument. Use this to support multiple __arg_xxx tags per global subprog argument. We leave btf_find_decl_tag_value() intact, but change its implementation to use a new btf_find_next_decl_tag() which can be straightforwardly used to find the next BTF type ID of a matching btf_decl_tag type. btf_prepare_func_args() is switched from btf_find_decl_tag_value() to btf_find_next_decl_tag() to gain multiple tags per argument support. 
Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240105000909.2818934-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e30100597d0a..377857b232c6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2472,6 +2472,8 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr struct btf *btf, const struct btf_type *t); const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt, int comp_idx, const char *tag_key); +int btf_find_next_decl_tag(const struct btf *btf, const struct btf_type *pt, + int comp_idx, const char *tag_key, int last_id); struct bpf_prog *bpf_prog_by_id(u32 id); struct bpf_link *bpf_link_by_id(u32 id); -- cgit v1.2.3 From 32f55dd4add4df1a5bc8febc1fafd3086290dbf6 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Mon, 8 Jan 2024 22:51:58 +0200 Subject: bpf: Make bpf_for_each_spilled_reg consider narrow spills Adjust the check in bpf_get_spilled_reg to take into account spilled registers narrower than 64 bits. That allows find_equal_scalars to properly adjust the range of all spilled registers that have the same ID. Before this change, it was possible for a register and a spilled register to have the same IDs but different ranges if the spill was narrower than 64 bits and a range check was performed on the register. 
Signed-off-by: Maxim Mikityanskiy Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240108205209.838365-5-maxtram95@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d07d857ca67f..e11baecbde68 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -453,7 +453,7 @@ struct bpf_verifier_state { #define bpf_get_spilled_reg(slot, frame, mask) \ (((slot < frame->allocated_stack / BPF_REG_SIZE) && \ - ((1 << frame->stack[slot].slot_type[0]) & (mask))) \ + ((1 << frame->stack[slot].slot_type[BPF_REG_SIZE - 1]) & (mask))) \ ? &frame->stack[slot].spilled_ptr : NULL) /* Iterate over 'frame', setting 'reg' to either NULL or a spilled register. */ -- cgit v1.2.3 From 3b1f89e747cd4b24244f2798a35d28815b744303 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Fri, 19 Jan 2024 14:49:52 -0800 Subject: bpf: refactory struct_ops type initialization to a function. Move the majority of the code to bpf_struct_ops_init_one(), which can then be utilized for the initialization of newly registered dynamically allocated struct_ops types in the following patches. 
Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240119225005.668602-2-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- include/linux/btf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index cf5c6ff48981..932af1680bb5 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -137,6 +137,7 @@ struct btf_struct_metas { extern const struct file_operations btf_fops; +const char *btf_get_name(const struct btf *btf); void btf_get(struct btf *btf); void btf_put(struct btf *btf); int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_sz); -- cgit v1.2.3 From 4c5763ed996a61b51d721d0968d0df957826ea49 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Fri, 19 Jan 2024 14:49:54 -0800 Subject: bpf, net: introduce bpf_struct_ops_desc. Move some of the members of bpf_struct_ops to bpf_struct_ops_desc. type_id is no longer available in bpf_struct_ops. Modules should get it from the btf received by kmod's init function.
Cc: netdev@vger.kernel.org Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240119225005.668602-4-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 377857b232c6..7fc95e7babab 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1673,18 +1673,23 @@ struct bpf_struct_ops { void (*unreg)(void *kdata); int (*update)(void *kdata, void *old_kdata); int (*validate)(void *kdata); - const struct btf_type *type; - const struct btf_type *value_type; + void *cfi_stubs; const char *name; struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS]; +}; + +struct bpf_struct_ops_desc { + struct bpf_struct_ops *st_ops; + + const struct btf_type *type; + const struct btf_type *value_type; u32 type_id; u32 value_id; - void *cfi_stubs; }; #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) #define BPF_MODULE_OWNER ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA)) -const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id); +const struct bpf_struct_ops_desc *bpf_struct_ops_find(u32 type_id); void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log); bool bpf_struct_ops_get(const void *kdata); void bpf_struct_ops_put(const void *kdata); @@ -1728,7 +1733,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); #endif #else -static inline const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id) +static inline const struct bpf_struct_ops_desc *bpf_struct_ops_find(u32 type_id) { return NULL; } -- cgit v1.2.3 From 1338b93346587a2a6ac79bbcf55ef5b357745573 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Fri, 19 Jan 2024 14:49:57 -0800 Subject: bpf: pass btf object id in bpf_map_info. 
Include btf object id (btf_obj_id) in bpf_map_info so that tools (e.g., bpftool struct_ops dump) know the correct btf from the kernel to look up type information of struct_ops types. Since struct_ops types can be defined and registered in a module, the type information of a struct_ops type is defined in the btf of the module defining it. The userspace tools need to know which btf is for the module defining a struct_ops type. Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240119225005.668602-7-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7fc95e7babab..29fcae9fa8ed 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1732,6 +1732,7 @@ struct bpf_dummy_ops { int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); #endif +void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map); #else static inline const struct bpf_struct_ops_desc *bpf_struct_ops_find(u32 type_id) { @@ -1759,6 +1760,9 @@ static inline int bpf_struct_ops_link_create(union bpf_attr *attr) { return -EOPNOTSUPP; } +static inline void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map) +{ +} #endif -- cgit v1.2.3 From 689423db3bda2244c24db8a64de4cdb37be1de41 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Fri, 19 Jan 2024 14:49:58 -0800 Subject: bpf: lookup struct_ops types from a given module BTF. This is a preparation for searching for struct_ops types from a specified module. BTF is always btf_vmlinux now. This patch passes a pointer of BTF to bpf_struct_ops_find_value() and bpf_struct_ops_find(). Once the new registration API of struct_ops types is used, other BTFs besides btf_vmlinux can also be passed to them.
Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240119225005.668602-8-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 29fcae9fa8ed..86ff8911d7ee 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1689,7 +1689,7 @@ struct bpf_struct_ops_desc { #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) #define BPF_MODULE_OWNER ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA)) -const struct bpf_struct_ops_desc *bpf_struct_ops_find(u32 type_id); +const struct bpf_struct_ops_desc *bpf_struct_ops_find(struct btf *btf, u32 type_id); void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log); bool bpf_struct_ops_get(const void *kdata); void bpf_struct_ops_put(const void *kdata); @@ -1734,7 +1734,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, #endif void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map); #else -static inline const struct bpf_struct_ops_desc *bpf_struct_ops_find(u32 type_id) +static inline const struct bpf_struct_ops_desc *bpf_struct_ops_find(struct btf *btf, u32 type_id) { return NULL; } -- cgit v1.2.3 From e3f87fdfed7b770dd7066b02262b12747881e76d Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Fri, 19 Jan 2024 14:50:00 -0800 Subject: bpf: hold module refcnt in bpf_struct_ops map creation and prog verification. To ensure that a module remains accessible whenever a struct_ops object of a struct_ops type provided by the module is still in use. struct bpf_struct_ops_map doesn't hold a refcnt to btf anymore since a module will hold a refcnt to its btf already. But, struct_ops programs are different. They hold their associated btf, not the module since they need only btf to assure their types (signatures).
However, verifier holds the refcnt of the associated module of a struct_ops type temporarily while verifying a struct_ops prog. Verifier needs the help from the verifier operators (struct bpf_verifier_ops) provided by the owner module to verify data access of a prog, provide information, and generate code. This patch also adds a count of links (links_cnt) to bpf_struct_ops_map. It prevents bpf_struct_ops_map_put_progs() from accessing btf after calling module_put() in bpf_struct_ops_map_free(). Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240119225005.668602-10-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 1 + include/linux/bpf_verifier.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 86ff8911d7ee..a5b425893d38 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1674,6 +1674,7 @@ struct bpf_struct_ops { int (*update)(void *kdata, void *old_kdata); int (*validate)(void *kdata); void *cfi_stubs; + struct module *owner; const char *name; struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS]; }; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e11baecbde68..7f5816482a10 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -662,6 +662,7 @@ struct bpf_verifier_env { u32 prev_insn_idx; struct bpf_prog *prog; /* eBPF program being verified */ const struct bpf_verifier_ops *ops; + struct module *attach_btf_mod; /* The owner module of prog->aux->attach_btf */ struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ int stack_size; /* number of states to be processed */ bool strict_alignment; /* perform strict pointer alignment checks */ -- cgit v1.2.3 From 612d087d4ba54cef47946e22e5dabad762dd7ed5 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Fri, 19 Jan 2024 14:50:01 -0800 Subject: bpf: validate value_type A value_type should consist of three
components: refcnt, state, and data. refcnt and state have been moved to struct bpf_struct_ops_common_value to make it easier to check the value type. Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240119225005.668602-11-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a5b425893d38..7c178170f93f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1688,6 +1688,18 @@ struct bpf_struct_ops_desc { u32 value_id; }; +enum bpf_struct_ops_state { + BPF_STRUCT_OPS_STATE_INIT, + BPF_STRUCT_OPS_STATE_INUSE, + BPF_STRUCT_OPS_STATE_TOBEFREE, + BPF_STRUCT_OPS_STATE_READY, +}; + +struct bpf_struct_ops_common_value { + refcount_t refcnt; + enum bpf_struct_ops_state state; +}; + #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) #define BPF_MODULE_OWNER ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA)) const struct bpf_struct_ops_desc *bpf_struct_ops_find(struct btf *btf, u32 type_id); -- cgit v1.2.3 From f6be98d19985411ca1f3d53413d94d5b7f41c200 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Fri, 19 Jan 2024 14:50:02 -0800 Subject: bpf, net: switch to dynamic registration Replace the static list of struct_ops types with per-btf struct_ops_tab to enable dynamic registration. Both bpf_dummy_ops and bpf_tcp_ca now utilize the registration function instead of being listed in bpf_struct_ops_types.h.
Cc: netdev@vger.kernel.org Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240119225005.668602-12-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 27 +++++++++++++++++---------- include/linux/btf.h | 12 ++++++++++++ 2 files changed, 29 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7c178170f93f..75b7f9b19c6a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1701,9 +1701,20 @@ struct bpf_struct_ops_common_value { }; #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) +/* This macro helps developer to register a struct_ops type and generate + * type information correctly. Developers should use this macro to register + * a struct_ops type instead of calling __register_bpf_struct_ops() directly. + */ +#define register_bpf_struct_ops(st_ops, type) \ + ({ \ + struct bpf_struct_ops_##type { \ + struct bpf_struct_ops_common_value common; \ + struct type data ____cacheline_aligned_in_smp; \ + }; \ + BTF_TYPE_EMIT(struct bpf_struct_ops_##type); \ + __register_bpf_struct_ops(st_ops); \ + }) #define BPF_MODULE_OWNER ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA)) -const struct bpf_struct_ops_desc *bpf_struct_ops_find(struct btf *btf, u32 type_id); -void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log); bool bpf_struct_ops_get(const void *kdata); void bpf_struct_ops_put(const void *kdata); int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, @@ -1745,16 +1756,12 @@ struct bpf_dummy_ops { int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); #endif +int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, + struct btf *btf, + struct bpf_verifier_log *log); void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map); #else -static inline const struct bpf_struct_ops_desc *bpf_struct_ops_find(struct btf *btf, u32 
type_id) -{ - return NULL; -} -static inline void bpf_struct_ops_init(struct btf *btf, - struct bpf_verifier_log *log) -{ -} +#define register_bpf_struct_ops(st_ops, type) ({ (void *)(st_ops); 0; }) static inline bool bpf_try_module_get(const void *data, struct module *owner) { return try_module_get(owner); diff --git a/include/linux/btf.h b/include/linux/btf.h index 932af1680bb5..1ee8977b8c95 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -497,6 +497,18 @@ static inline void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id) struct bpf_verifier_log; +#if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) +struct bpf_struct_ops; +int __register_bpf_struct_ops(struct bpf_struct_ops *st_ops); +const struct bpf_struct_ops_desc *bpf_struct_ops_find_value(struct btf *btf, u32 value_id); +const struct bpf_struct_ops_desc *bpf_struct_ops_find(struct btf *btf, u32 type_id); +#else +static inline const struct bpf_struct_ops_desc *bpf_struct_ops_find(struct btf *btf, u32 type_id) +{ + return NULL; +} +#endif + #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); -- cgit v1.2.3 From 3f3174996be6b4312c38f54d5969f5d5b75fec9e Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 22 Jan 2024 22:13:59 -0600 Subject: RAS: Introduce AMD Address Translation Library AMD Zen-based systems report memory errors through Machine Check banks representing Unified Memory Controllers (UMCs). The address value reported for DRAM ECC errors is a "normalized address" that is relative to the UMC. This normalized address must be converted to a system physical address to be usable by the OS. Support for this address translation was introduced to the MCA subsystem with Zen1 systems. The code was later moved to the AMD64 EDAC module, since this was the only user of the code at the time. However, there are uses for this translation outside of EDAC. 
The system physical address can be used in MCA for preemptive page offlining as done in some MCA notifier functions. Also, this translation is needed as the basis of similar functionality needed for some CXL configurations on AMD systems. Introduce a common address translation library that can be used for multiple subsystems including MCA, EDAC, and CXL. Include support for UMC normalized to system physical address translation for current CPU systems. The Data Fabric Indirect register access offsets and one of the register fields were changed. Default to the current offsets and register field definition. And fallback to the older values if running on a "legacy" system. Provide built-in code to facilitate the loading and unloading of the library module without affecting other modules or built-in code. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240123041401.79812-2-yazen.ghannam@amd.com --- include/linux/ras.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ras.h b/include/linux/ras.h index 1f4048bf2674..09c632832bf1 100644 --- a/include/linux/ras.h +++ b/include/linux/ras.h @@ -25,6 +25,7 @@ void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id, const char *fru_text, const u8 sev, const u8 *err, const u32 len); void log_arm_hw_error(struct cper_sec_proc_arm *err); + #else static inline void log_non_standard_event(const guid_t *sec_type, @@ -35,4 +36,19 @@ static inline void log_arm_hw_error(struct cper_sec_proc_arm *err) { return; } #endif +struct atl_err { + u64 addr; + u64 ipid; + u32 cpu; +}; + +#if IS_ENABLED(CONFIG_AMD_ATL) +void amd_atl_register_decoder(unsigned long (*f)(struct atl_err *)); +void amd_atl_unregister_decoder(void); +unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err); +#else +static inline unsigned long +amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; } +#endif 
/* CONFIG_AMD_ATL */ + #endif /* __RAS_H__ */ -- cgit v1.2.3 From d22083a5f09b2066728a91f3abb71284451247b1 Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Mon, 22 Jan 2024 16:57:13 +0800 Subject: irqchip/gic(v3): Replace gic_irq() with irqd_to_hwirq() GIC & GIC-v3 share same gic_irq() implementations, both of which serve exact same purpose as irqd_to_hwirq(). irqd_to_hwirq() is a generic and top level API of the interrupt subsystem, it's independent of any chip implementation. Replace gic_irq() with irqd_to_hwirq() and convert struct irq_data::hwirq to irq_hw_number_t explicitly. Suggested-by: Marc Zyngier Signed-off-by: Dawei Li Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240122085716.2999875-3-dawei.li@shingroup.cn --- include/linux/irq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 90081afa10ce..97baa937ab5b 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -179,7 +179,7 @@ struct irq_common_data { struct irq_data { u32 mask; unsigned int irq; - unsigned long hwirq; + irq_hw_number_t hwirq; struct irq_common_data *common; struct irq_chip *chip; struct irq_domain *domain; -- cgit v1.2.3 From 9676635685fe348003a29948d9726e5d9e4b4a6e Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Mon, 22 Jan 2024 16:57:14 +0800 Subject: genirq: Remove unneeded forward declaration The prototype of irq_flow_handler_t is independent of irq_data, so remove unneeded forward declaration.
Signed-off-by: Dawei Li Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240122085716.2999875-4-dawei.li@shingroup.cn --- include/linux/irqhandler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqhandler.h b/include/linux/irqhandler.h index c30f454a9518..72dd1eb3a0e7 100644 --- a/include/linux/irqhandler.h +++ b/include/linux/irqhandler.h @@ -8,7 +8,7 @@ */ struct irq_desc; -struct irq_data; + typedef void (*irq_flow_handler_t)(struct irq_desc *desc); #endif -- cgit v1.2.3 From 71a5849aedaa9ea028fc51ee74576cad61954743 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Fri, 29 Sep 2023 21:11:57 +0000 Subject: mm: Change mmap_rnd_bits_max to __ro_after_init Allow mmap_rnd_bits_max to be updated on architectures that determine virtual address space size at runtime instead of relying on Kconfig options by changing it from const to __ro_after_init. Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Reviewed-by: Palmer Dabbelt Acked-by: Palmer Dabbelt Link: https://lore.kernel.org/r/20230929211155.3910949-5-samitolvanen@google.com Signed-off-by: Palmer Dabbelt --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f5a97dec5169..2488c0c5a288 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -86,7 +86,7 @@ extern int sysctl_legacy_va_layout; #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS extern const int mmap_rnd_bits_min; -extern const int mmap_rnd_bits_max; +extern int mmap_rnd_bits_max __ro_after_init; extern int mmap_rnd_bits __read_mostly; #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS -- cgit v1.2.3 From b7dbaace39713025f1fd33407c89651a0c09f667 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 24 Jan 2024 16:29:33 +0100 Subject: fsnotify: Add fsnotify_sb_has_watchers() helper Instead of opencoded checks for number of fsnotify connectors add a helper 
fsnotify_sb_has_watchers(). Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- include/linux/fsnotify.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 8300a5286988..1a9de119a0f7 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -17,6 +17,12 @@ #include #include +/* Are there any inode/mount/sb objects that are being watched at all? */ +static inline bool fsnotify_sb_has_watchers(struct super_block *sb) +{ + return atomic_long_read(&sb->s_fsnotify_connectors); +} + /* * Notify this @dir inode about a change in a child directory entry. * The directory entry may have turned positive or negative or its inode may @@ -30,7 +36,7 @@ static inline int fsnotify_name(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, u32 cookie) { - if (atomic_long_read(&dir->i_sb->s_fsnotify_connectors) == 0) + if (!fsnotify_sb_has_watchers(dir->i_sb)) return 0; return fsnotify(mask, data, data_type, dir, name, NULL, cookie); @@ -44,7 +50,7 @@ static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry, static inline void fsnotify_inode(struct inode *inode, __u32 mask) { - if (atomic_long_read(&inode->i_sb->s_fsnotify_connectors) == 0) + if (!fsnotify_sb_has_watchers(inode->i_sb)) return; if (S_ISDIR(inode->i_mode)) @@ -59,7 +65,7 @@ static inline int fsnotify_parent(struct dentry *dentry, __u32 mask, { struct inode *inode = d_inode(dentry); - if (atomic_long_read(&inode->i_sb->s_fsnotify_connectors) == 0) + if (!fsnotify_sb_has_watchers(inode->i_sb)) return 0; if (S_ISDIR(inode->i_mode)) { -- cgit v1.2.3 From b017500ab53c06441ff7d3a681484e37039b4f57 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Mon, 22 Jan 2024 17:11:26 +0100 Subject: PM: sleep: Use bool for all 1-bit fields in struct dev_pm_info For some 1-bit fields in struct dev_pm_info the data type is bool, while for some other 1-bit fields in there it is unsigned int, and these differences are somewhat arbitrary. For consistency, change the data type of the latter to bool, so that all of the 1-bit fields in struct dev_pm_info fields are bool. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Greg Kroah-Hartman --- include/linux/pm.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index a2f3e53a8196..97b0e23363c8 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -662,8 +662,8 @@ struct pm_subsys_data { struct dev_pm_info { pm_message_t power_state; - unsigned int can_wakeup:1; - unsigned int async_suspend:1; + bool can_wakeup:1; + bool async_suspend:1; bool in_dpm_list:1; /* Owned by the PM core */ bool is_prepared:1; /* Owned by the PM core */ bool is_suspended:1; /* Ditto */ @@ -682,10 +682,10 @@ struct dev_pm_info { bool syscore:1; bool no_pm_callbacks:1; /* Owned by the PM core */ bool async_in_progress:1; /* Owned by the PM core */ - unsigned int must_resume:1; /* Owned by the PM core */ - unsigned int may_skip_resume:1; /* Set by subsystems */ + bool must_resume:1; /* Owned by the PM core */ + bool may_skip_resume:1; /* Set by subsystems */ #else - unsigned int should_wakeup:1; + bool should_wakeup:1; #endif #ifdef CONFIG_PM struct hrtimer suspend_timer; @@ -696,17 +696,17 @@ struct dev_pm_info { atomic_t usage_count; atomic_t child_count; unsigned int disable_depth:3; - unsigned int idle_notification:1; - unsigned int request_pending:1; - unsigned int deferred_resume:1; - unsigned int needs_force_resume:1; - unsigned int runtime_auto:1; + bool idle_notification:1; + bool request_pending:1; + bool deferred_resume:1; + bool 
needs_force_resume:1; + bool runtime_auto:1; bool ignore_children:1; - unsigned int no_callbacks:1; - unsigned int irq_safe:1; - unsigned int use_autosuspend:1; - unsigned int timer_autosuspends:1; - unsigned int memalloc_noio:1; + bool no_callbacks:1; + bool irq_safe:1; + bool use_autosuspend:1; + bool timer_autosuspends:1; + bool memalloc_noio:1; unsigned int links_count; enum rpm_request request; enum rpm_status runtime_status; -- cgit v1.2.3 From 6fe01d3cbb924a72493eb3f4722dfcfd1c194234 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 23 Jan 2024 18:20:59 -0800 Subject: bpf: Add BPF token delegation mount options to BPF FS Add few new mount options to BPF FS that allow to specify that a given BPF FS instance allows creation of BPF token (added in the next patch), and what sort of operations are allowed under BPF token. As such, we get 4 new mount options, each is a bit mask - `delegate_cmds` allow to specify which bpf() syscall commands are allowed with BPF token derived from this BPF FS instance; - if BPF_MAP_CREATE command is allowed, `delegate_maps` specifies a set of allowable BPF map types that could be created with BPF token; - if BPF_PROG_LOAD command is allowed, `delegate_progs` specifies a set of allowable BPF program types that could be loaded with BPF token; - if BPF_PROG_LOAD command is allowed, `delegate_attachs` specifies a set of allowable BPF program attach types that could be loaded with BPF token; delegate_progs and delegate_attachs are meant to be used together, as full BPF program type is, in general, determined through both program type and program attach type. Currently, these mount options accept the following forms of values: - a special value "any", that enables all possible values of a given bit set; - numeric value (decimal or hexadecimal, determined by kernel automatically) that specifies a bit mask value directly; - all the values for a given mount option are combined, if specified multiple times. 
E.g., `mount -t bpf nodev /path/to/mount -o delegate_maps=0x1 -o delegate_maps=0x2` will result in a combined 0x3 mask. Ideally, more convenient (for humans) symbolic form derived from corresponding UAPI enums would be accepted (e.g., `-o delegate_progs=kprobe|tracepoint`) and I intend to implement this, but it requires a bunch of UAPI header churn, so I postponed it until this feature lands upstream or at least there is a definite consensus that this feature is acceptable and is going to make it, just to minimize amount of wasted effort and not increase amount of non-essential code to be reviewed. Attentive reader will notice that BPF FS is now marked as FS_USERNS_MOUNT, which theoretically makes it mountable inside non-init user namespace as long as the process has sufficient *namespaced* capabilities within that user namespace. But in reality we still restrict BPF FS to be mountable only by processes with CAP_SYS_ADMIN *in init userns* (extra check in bpf_fill_super()). FS_USERNS_MOUNT is added to allow creating BPF FS context object (i.e., fsopen("bpf")) from inside unprivileged process inside non-init userns, to capture that userns as the owning userns. It will still be required to pass this context object back to privileged process to instantiate and mount it. This manipulation is important, because capturing non-init userns as the owning userns of BPF FS instance (super block) allows to use that userns to constrain BPF token to that userns later on (see next patch). So creating BPF FS with delegation inside unprivileged userns will restrict derived BPF token objects to only "work" inside that intended userns, making it scoped to an intended "container". Also, setting these delegation options requires capable(CAP_SYS_ADMIN), so unprivileged process cannot set this up without involvement of a privileged process. There is a set of selftests at the end of the patch set that simulates this sequence of steps and validates that everything works as intended.
But careful review is requested to make sure there are no missed gaps in the implementation and testing. This somewhat subtle set of aspects is the result of previous discussions ([0]) about various user namespace implications and interactions with BPF token functionality and is necessary to contain BPF token inside intended user namespace. [0] https://lore.kernel.org/bpf/20230704-hochverdient-lehne-eeb9eeef785e@brauner/ Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Christian Brauner Link: https://lore.kernel.org/bpf/20240124022127.2379740-3-andrii@kernel.org --- include/linux/bpf.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 75b7f9b19c6a..28374cec49df 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1609,6 +1609,18 @@ struct bpf_link_primer { u32 id; }; +struct bpf_mount_opts { + kuid_t uid; + kgid_t gid; + umode_t mode; + + /* BPF token-related delegation options */ + u64 delegate_cmds; + u64 delegate_maps; + u64 delegate_progs; + u64 delegate_attachs; +}; + struct bpf_struct_ops_value; struct btf_member; -- cgit v1.2.3 From 35f96de04127d332a5c5e8a155d31f452f88c76d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 23 Jan 2024 18:21:00 -0800 Subject: bpf: Introduce BPF token object Add new kind of BPF kernel object, BPF token. BPF token is meant to allow delegating privileged BPF functionality, like loading a BPF program or creating a BPF map, from privileged process to a *trusted* unprivileged process, all while having a good amount of control over which privileged operations could be performed using provided BPF token. This is achieved through mounting BPF FS instance with extra delegation mount options, which determine what operations are delegatable, and also constraining it to the owning user namespace (as mentioned in the previous patch). 
BPF token itself is just a derivative from BPF FS and can be created through a new bpf() syscall command, BPF_TOKEN_CREATE, which accepts BPF FS FD, which can be attained through open() API by opening BPF FS mount point. Currently, BPF token "inherits" delegated command, map types, prog type, and attach type bit sets from BPF FS as is. In the future, having a BPF token as a separate object with its own FD, we can allow to further restrict BPF token's allowable set of things either at the creation time or after the fact, allowing the process to guard itself further from unintentionally trying to load undesired kind of BPF programs. But for now we keep things simple and just copy bit sets as is. When BPF token is created from BPF FS mount, we take reference to the BPF super block's owning user namespace, and then use that namespace for checking all the {CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN} capabilities that are normally only checked against init userns (using capable()), but now we check them using ns_capable() instead (if BPF token is provided). See bpf_token_capable() for details. Such setup means that BPF token in itself is not sufficient to grant BPF functionality. User namespaced process has to *also* have necessary combination of capabilities inside that user namespace. So while previously CAP_BPF was useless when granted within user namespace, now it gains a meaning and allows container managers and sys admins to have a flexible control over which processes can and need to use BPF functionality within the user namespace (i.e., container in practice). And BPF FS delegation mount options and derived BPF tokens serve as a per-container "flag" to grant overall ability to use bpf() (plus further restrict on which parts of bpf() syscalls are treated as namespaced). Note also, BPF_TOKEN_CREATE command itself requires ns_capable(CAP_BPF) within the BPF FS owning user namespace, rounding up the ns_capable() story of BPF token.
Also creating BPF token in init user namespace is currently not supported, given BPF token doesn't have any effect in init user namespace anyways. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Christian Brauner Link: https://lore.kernel.org/bpf/20240124022127.2379740-4-andrii@kernel.org --- include/linux/bpf.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 28374cec49df..d9ff7ce547b4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -52,6 +52,10 @@ struct module; struct bpf_func_state; struct ftrace_ops; struct cgroup; +struct bpf_token; +struct user_namespace; +struct super_block; +struct inode; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -1621,6 +1625,13 @@ struct bpf_mount_opts { u64 delegate_attachs; }; +struct bpf_token { + struct work_struct work; + atomic64_t refcnt; + struct user_namespace *userns; + u64 allowed_cmds; +}; + struct bpf_struct_ops_value; struct btf_member; @@ -2109,6 +2120,7 @@ static inline void bpf_enable_instrumentation(void) migrate_enable(); } +extern const struct super_operations bpf_super_ops; extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; extern const struct file_operations bpf_iter_fops; @@ -2243,6 +2255,8 @@ static inline void bpf_map_dec_elem_count(struct bpf_map *map) extern int sysctl_unprivileged_bpf_disabled; +bool bpf_token_capable(const struct bpf_token *token, int cap); + static inline bool bpf_allow_ptr_leaks(void) { return perfmon_capable(); @@ -2277,8 +2291,17 @@ int bpf_link_new_fd(struct bpf_link *link); struct bpf_link *bpf_link_get_from_fd(u32 ufd); struct bpf_link *bpf_link_get_curr_or_next(u32 *id); +void bpf_token_inc(struct bpf_token *token); +void bpf_token_put(struct bpf_token *token); +int bpf_token_create(union bpf_attr *attr); +struct bpf_token *bpf_token_get_from_fd(u32 ufd); + 
+bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd); + int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname); int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags); +struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir, + umode_t mode); #define BPF_ITER_FUNC_PREFIX "bpf_iter_" #define DEFINE_BPF_ITER_FUNC(target, args...) \ @@ -2638,6 +2661,24 @@ static inline int bpf_obj_get_user(const char __user *pathname, int flags) return -EOPNOTSUPP; } +static inline bool bpf_token_capable(const struct bpf_token *token, int cap) +{ + return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN)); +} + +static inline void bpf_token_inc(struct bpf_token *token) +{ +} + +static inline void bpf_token_put(struct bpf_token *token) +{ +} + +static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd) +{ + return ERR_PTR(-EOPNOTSUPP); +} + static inline void __dev_flush(void) { } -- cgit v1.2.3 From a177fc2bf6fd83704854feaf7aae926b1df4f0b9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 23 Jan 2024 18:21:01 -0800 Subject: bpf: Add BPF token support to BPF_MAP_CREATE command Allow providing token_fd for BPF_MAP_CREATE command to allow controlled BPF map creation from unprivileged process through delegated BPF token. New BPF_F_TOKEN_FD flag is added to specify together with BPF token FD for BPF_MAP_CREATE command. Wire through a set of allowed BPF map types to BPF token, derived from BPF FS at BPF token creation time. This, in combination with allowed_cmds allows to create a narrowly-focused BPF token (controlled by privileged agent) with a restrictive set of BPF maps that application can attempt to create. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20240124022127.2379740-5-andrii@kernel.org --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d9ff7ce547b4..8252452d0c4d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1630,6 +1630,7 @@ struct bpf_token { atomic64_t refcnt; struct user_namespace *userns; u64 allowed_cmds; + u64 allowed_maps; }; struct bpf_struct_ops_value; @@ -2297,6 +2298,7 @@ int bpf_token_create(union bpf_attr *attr); struct bpf_token *bpf_token_get_from_fd(u32 ufd); bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd); +bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type); int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname); int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags); -- cgit v1.2.3 From caf8f28e036c4ba1e823355da6c0c01c39e70ab9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 23 Jan 2024 18:21:03 -0800 Subject: bpf: Add BPF token support to BPF_PROG_LOAD command Add basic support of BPF token to BPF_PROG_LOAD. BPF_F_TOKEN_FD flag should be set in prog_flags field when providing prog_token_fd. Wire through a set of allowed BPF program types and attach types, derived from BPF FS at BPF token creation time. Then make sure we perform bpf_token_capable() checks everywhere where it's relevant. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20240124022127.2379740-7-andrii@kernel.org --- include/linux/bpf.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8252452d0c4d..d0bf37e3f166 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1489,6 +1489,7 @@ struct bpf_prog_aux { #ifdef CONFIG_SECURITY void *security; #endif + struct bpf_token *token; struct bpf_prog_offload *offload; struct btf *btf; struct bpf_func_info *func_info; @@ -1631,6 +1632,8 @@ struct bpf_token { struct user_namespace *userns; u64 allowed_cmds; u64 allowed_maps; + u64 allowed_progs; + u64 allowed_attachs; }; struct bpf_struct_ops_value; @@ -2299,6 +2302,9 @@ struct bpf_token *bpf_token_get_from_fd(u32 ufd); bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd); bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type); +bool bpf_token_allow_prog_type(const struct bpf_token *token, + enum bpf_prog_type prog_type, + enum bpf_attach_type attach_type); int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname); int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags); -- cgit v1.2.3 From bbc1d24724e110b86a1a7c3c1724ce0d62cc1e2e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 23 Jan 2024 18:21:04 -0800 Subject: bpf: Take into account BPF token when fetching helper protos Instead of performing unconditional system-wide bpf_capable() and perfmon_capable() calls inside bpf_base_func_proto() function (and other similar ones) to determine eligibility of a given BPF helper for a given program, use previously recorded BPF token during BPF_PROG_LOAD command handling to inform the decision. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20240124022127.2379740-8-andrii@kernel.org --- include/linux/bpf.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d0bf37e3f166..1325225bf602 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2550,7 +2550,8 @@ int btf_find_next_decl_tag(const struct btf *btf, const struct btf_type *pt, struct bpf_prog *bpf_prog_by_id(u32 id); struct bpf_link *bpf_link_by_id(u32 id); -const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); +const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog); void bpf_task_storage_free(struct task_struct *task); void bpf_cgrp_storage_free(struct cgroup *cgroup); bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); @@ -2810,7 +2811,7 @@ static inline int btf_struct_access(struct bpf_verifier_log *log, } static inline const struct bpf_func_proto * -bpf_base_func_proto(enum bpf_func_id func_id) +bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { return NULL; } -- cgit v1.2.3 From d79a3549754725bb90e58104417449edddf3da3d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 23 Jan 2024 18:21:05 -0800 Subject: bpf: Consistently use BPF token throughout BPF verifier logic Remove remaining direct queries to perfmon_capable() and bpf_capable() in BPF verifier logic and instead use BPF token (if available) to make decisions about privileges. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20240124022127.2379740-9-andrii@kernel.org --- include/linux/bpf.h | 16 ++++++++-------- include/linux/filter.h | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1325225bf602..4e146e9708be 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2261,24 +2261,24 @@ extern int sysctl_unprivileged_bpf_disabled; bool bpf_token_capable(const struct bpf_token *token, int cap); -static inline bool bpf_allow_ptr_leaks(void) +static inline bool bpf_allow_ptr_leaks(const struct bpf_token *token) { - return perfmon_capable(); + return bpf_token_capable(token, CAP_PERFMON); } -static inline bool bpf_allow_uninit_stack(void) +static inline bool bpf_allow_uninit_stack(const struct bpf_token *token) { - return perfmon_capable(); + return bpf_token_capable(token, CAP_PERFMON); } -static inline bool bpf_bypass_spec_v1(void) +static inline bool bpf_bypass_spec_v1(const struct bpf_token *token) { - return cpu_mitigations_off() || perfmon_capable(); + return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON); } -static inline bool bpf_bypass_spec_v4(void) +static inline bool bpf_bypass_spec_v4(const struct bpf_token *token) { - return cpu_mitigations_off() || perfmon_capable(); + return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON); } int bpf_map_new_fd(struct bpf_map *map, int flags); diff --git a/include/linux/filter.h b/include/linux/filter.h index 35f067fd3840..fee070b9826e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1140,7 +1140,7 @@ static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) return false; if (!bpf_jit_harden) return false; - if (bpf_jit_harden == 1 && bpf_capable()) + if (bpf_jit_harden == 1 && bpf_token_capable(prog->aux->token, CAP_BPF)) return false; return true; -- cgit v1.2.3 From 
1b67772e4e3f16cd647b229cae95fc06d120be08 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 23 Jan 2024 18:21:06 -0800 Subject: bpf,lsm: Refactor bpf_prog_alloc/bpf_prog_free LSM hooks Based on upstream discussion ([0]), rework existing bpf_prog_alloc_security LSM hook. Rename it to bpf_prog_load and instead of passing bpf_prog_aux, pass proper bpf_prog pointer for a full BPF program struct. Also, we pass bpf_attr union with all the user-provided arguments for BPF_PROG_LOAD command. This will give LSMs as much information as we can basically provide. The hook is also BPF token-aware now, and optional bpf_token struct is passed as a third argument. bpf_prog_load LSM hook is called after a bunch of sanity checks were performed, bpf_prog and bpf_prog_aux were allocated and filled out, but right before performing full-fledged BPF verification step. bpf_prog_free LSM hook is now accepting struct bpf_prog argument, for consistency. SELinux code is adjusted to all new names, types, and signatures. Note, given that bpf_prog_load (previously bpf_prog_alloc) hook can be used by some LSMs to allocate extra security blob, but also by other LSMs to reject BPF program loading, we need to make sure that bpf_prog_free LSM hook is called after bpf_prog_load/bpf_prog_alloc one *even* if the hook itself returned error. If we don't do that, we run the risk of leaking memory. This seems to be possible today when combining SELinux and BPF LSM, as one example, depending on their relative ordering. Also, for BPF LSM setup, add bpf_prog_load and bpf_prog_free to sleepable LSM hooks list, as they are both executed in sleepable context. Also drop bpf_prog_load hook from untrusted, as there is no issue with refcount or anything else anymore, that originally forced us to add it to untrusted list in c0c852dd1876 ("bpf: Do not mark certain LSM hook arguments as trusted"). We now trigger this hook much later and it should not be an issue anymore. 
[0] https://lore.kernel.org/bpf/9fe88aef7deabbe87d3fc38c4aea3c69.paul@paul-moore.com/ Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Paul Moore Link: https://lore.kernel.org/bpf/20240124022127.2379740-10-andrii@kernel.org --- include/linux/lsm_hook_defs.h | 5 +++-- include/linux/security.h | 12 +++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 185924c56378..370181aa685b 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -406,8 +406,9 @@ LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode) LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog) LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map) LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map) -LSM_HOOK(int, 0, bpf_prog_alloc_security, struct bpf_prog_aux *aux) -LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free_security, struct bpf_prog_aux *aux) +LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr, + struct bpf_token *token) +LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog) #endif /* CONFIG_BPF_SYSCALL */ LSM_HOOK(int, 0, locked_down, enum lockdown_reason what) diff --git a/include/linux/security.h b/include/linux/security.h index d0eb20f90b26..cb2932fce448 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2064,15 +2064,16 @@ static inline void securityfs_remove(struct dentry *dentry) union bpf_attr; struct bpf_map; struct bpf_prog; -struct bpf_prog_aux; +struct bpf_token; #ifdef CONFIG_SECURITY extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size); extern int security_bpf_map(struct bpf_map *map, fmode_t fmode); extern int security_bpf_prog(struct bpf_prog *prog); extern int security_bpf_map_alloc(struct bpf_map *map); extern void security_bpf_map_free(struct bpf_map *map); -extern int security_bpf_prog_alloc(struct 
bpf_prog_aux *aux); -extern void security_bpf_prog_free(struct bpf_prog_aux *aux); +extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, + struct bpf_token *token); +extern void security_bpf_prog_free(struct bpf_prog *prog); #else static inline int security_bpf(int cmd, union bpf_attr *attr, unsigned int size) @@ -2098,12 +2099,13 @@ static inline int security_bpf_map_alloc(struct bpf_map *map) static inline void security_bpf_map_free(struct bpf_map *map) { } -static inline int security_bpf_prog_alloc(struct bpf_prog_aux *aux) +static inline int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, + struct bpf_token *token) { return 0; } -static inline void security_bpf_prog_free(struct bpf_prog_aux *aux) +static inline void security_bpf_prog_free(struct bpf_prog *prog) { } #endif /* CONFIG_SECURITY */ #endif /* CONFIG_BPF_SYSCALL */ -- cgit v1.2.3 From a2431c7eabcf9bd5a1e7a1f7ecded40fdda4a8c5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 23 Jan 2024 18:21:07 -0800 Subject: bpf,lsm: Refactor bpf_map_alloc/bpf_map_free LSM hooks Similarly to bpf_prog_alloc LSM hook, rename and extend bpf_map_alloc hook into bpf_map_create, taking not just struct bpf_map, but also bpf_attr and bpf_token, to give a fuller context to LSMs. Unlike bpf_prog_alloc, there is no need to move the hook around, as it currently is firing right before allocating BPF map ID and FD, which seems to be a sweet spot. But like bpf_prog_alloc/bpf_prog_free combo, make sure that bpf_map_free LSM hook is called even if bpf_map_create hook returned error, as if few LSMs are combined together it could be that one LSM successfully allocated security blob for its needs, while subsequent LSM rejected BPF map creation. The former LSM would still need to free up LSM blob, so we need to ensure security_bpf_map_free() is called regardless of the outcome. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Paul Moore Link: https://lore.kernel.org/bpf/20240124022127.2379740-11-andrii@kernel.org --- include/linux/lsm_hook_defs.h | 5 +++-- include/linux/security.h | 6 ++++-- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 370181aa685b..1be4d3ca6efb 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -404,8 +404,9 @@ LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule) LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size) LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode) LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog) -LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map) -LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map) +LSM_HOOK(int, 0, bpf_map_create, struct bpf_map *map, union bpf_attr *attr, + struct bpf_token *token) +LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map) LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token) LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog) diff --git a/include/linux/security.h b/include/linux/security.h index cb2932fce448..83fcdc974116 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2069,7 +2069,8 @@ struct bpf_token; extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size); extern int security_bpf_map(struct bpf_map *map, fmode_t fmode); extern int security_bpf_prog(struct bpf_prog *prog); -extern int security_bpf_map_alloc(struct bpf_map *map); +extern int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, + struct bpf_token *token); extern void security_bpf_map_free(struct bpf_map *map); extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token); @@ -2091,7 +2092,8 @@ 
static inline int security_bpf_prog(struct bpf_prog *prog) return 0; } -static inline int security_bpf_map_alloc(struct bpf_map *map) +static inline int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, + struct bpf_token *token) { return 0; } -- cgit v1.2.3 From f568a3d49af9aed813a184353592efe29b0e3d16 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 23 Jan 2024 18:21:08 -0800 Subject: bpf,lsm: Add BPF token LSM hooks Wire up bpf_token_create and bpf_token_free LSM hooks, which allow to allocate LSM security blob (we add `void *security` field to struct bpf_token for that), but also control who can instantiate BPF token. This follows existing pattern for BPF map and BPF prog. Also add security_bpf_token_allow_cmd() and security_bpf_token_capable() LSM hooks that allow LSM implementation to control and negate (if necessary) BPF token's delegation of a specific bpf_cmd and capability, respectively. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Paul Moore Link: https://lore.kernel.org/bpf/20240124022127.2379740-12-andrii@kernel.org --- include/linux/bpf.h | 3 +++ include/linux/lsm_hook_defs.h | 5 +++++ include/linux/security.h | 25 +++++++++++++++++++++++++ 3 files changed, 33 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4e146e9708be..b86bd15a051d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1634,6 +1634,9 @@ struct bpf_token { u64 allowed_maps; u64 allowed_progs; u64 allowed_attachs; +#ifdef CONFIG_SECURITY + void *security; +#endif }; struct bpf_struct_ops_value; diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 1be4d3ca6efb..cd6fbc7af3f8 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -410,6 +410,11 @@ LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map) LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token) 
LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog) +LSM_HOOK(int, 0, bpf_token_create, struct bpf_token *token, union bpf_attr *attr, + struct path *path) +LSM_HOOK(void, LSM_RET_VOID, bpf_token_free, struct bpf_token *token) +LSM_HOOK(int, 0, bpf_token_cmd, const struct bpf_token *token, enum bpf_cmd cmd) +LSM_HOOK(int, 0, bpf_token_capable, const struct bpf_token *token, int cap) #endif /* CONFIG_BPF_SYSCALL */ LSM_HOOK(int, 0, locked_down, enum lockdown_reason what) diff --git a/include/linux/security.h b/include/linux/security.h index 83fcdc974116..15804af54f37 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -32,6 +32,7 @@ #include #include #include +#include #include struct linux_binprm; @@ -2075,6 +2076,11 @@ extern void security_bpf_map_free(struct bpf_map *map); extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token); extern void security_bpf_prog_free(struct bpf_prog *prog); +extern int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, + struct path *path); +extern void security_bpf_token_free(struct bpf_token *token); +extern int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd); +extern int security_bpf_token_capable(const struct bpf_token *token, int cap); #else static inline int security_bpf(int cmd, union bpf_attr *attr, unsigned int size) @@ -2109,6 +2115,25 @@ static inline int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr * static inline void security_bpf_prog_free(struct bpf_prog *prog) { } + +static inline int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, + struct path *path) +{ + return 0; +} + +static inline void security_bpf_token_free(struct bpf_token *token) +{ } + +static inline int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd) +{ + return 0; +} + +static inline int security_bpf_token_capable(const struct bpf_token *token, int cap) +{ + return 
0; +} #endif /* CONFIG_SECURITY */ #endif /* CONFIG_BPF_SYSCALL */ -- cgit v1.2.3 From d8f899d13d72d285db43dbb9df1acaed22d8c4e7 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 24 Jan 2024 22:28:55 +0800 Subject: fs: make the i_size_read/write helpers be smp_load_acquire/store_release() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In [Link] Linus mentions that acquire/release makes it clear which _particular_ memory accesses are the ordered ones, and it's unlikely to make any performance difference, so it's much better to pair up the release->acquire ordering than have a "wmb->rmb" ordering. ========================================================= update pagecache folio_mark_uptodate(folio) smp_wmb() set_bit PG_uptodate === ↑↑↑ STLR ↑↑↑ === smp_store_release(&inode->i_size, i_size) folio_test_uptodate(folio) test_bit PG_uptodate smp_rmb() === ↓↓↓ LDAR ↓↓↓ === smp_load_acquire(&inode->i_size) copy_page_to_iter() ========================================================= Calling smp_store_release() in i_size_write() ensures that the data in the page and the PG_uptodate bit are updated before the isize is updated, and calling smp_load_acquire() in i_size_read ensures that it will not read a newer isize than the data in the page. Therefore, this avoids buffered read-write inconsistencies caused by Load-Load reordering. 
Link: https://lore.kernel.org/r/CAHk-=wifOnmeJq+sn+2s-P46zw0SFEbw9BSCGgp2c5fYPtRPGw@mail.gmail.com/ Suggested-by: Linus Torvalds Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240124142857.4146716-2-libaokun1@huawei.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f6669147b9e..ebce4763b4bb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -907,7 +907,8 @@ static inline loff_t i_size_read(const struct inode *inode) preempt_enable(); return i_size; #else - return inode->i_size; + /* Pairs with smp_store_release() in i_size_write() */ + return smp_load_acquire(&inode->i_size); #endif } @@ -929,7 +930,12 @@ static inline void i_size_write(struct inode *inode, loff_t i_size) inode->i_size = i_size; preempt_enable(); #else - inode->i_size = i_size; + /* + * Pairs with smp_load_acquire() in i_size_read() to ensure + * changes related to inode size (such as page contents) are + * visible before we see the changed inode size. + */ + smp_store_release(&inode->i_size, i_size); #endif } -- cgit v1.2.3 From 83e897a961b801536dd1d736e9ede5b1ddb1c188 Mon Sep 17 00:00:00 2001 From: Ayala Beker Date: Tue, 2 Jan 2024 21:35:33 +0200 Subject: wifi: ieee80211: add definitions for negotiated TID to Link map Add the relevant definitions and structures for TID to Link mapping negotiation request/response/teardown according to P802.11be_D4.0. 
Signed-off-by: Ayala Beker Reviewed-by: Gregory Greenman Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://msgid.link/20240102213313.9ef2b866c8c7.Ieaf7dadea9961e0edc55d19c99f0f9fbae591de6@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 83c4d060a559..eafa70e5ba94 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1454,6 +1454,20 @@ struct ieee80211_mgmt { u8 max_tod_error; u8 max_toa_error; } __packed wnm_timing_msr; + struct { + u8 action_code; + u8 dialog_token; + u8 variable[]; + } __packed ttlm_req; + struct { + u8 action_code; + u8 dialog_token; + u8 status_code; + u8 variable[]; + } __packed ttlm_res; + struct { + u8 action_code; + } __packed ttlm_tear_down; } u; } __packed action; DECLARE_FLEX_ARRAY(u8, body); /* Generic frame body */ @@ -3357,6 +3371,8 @@ enum ieee80211_statuscode { WLAN_STATUS_UNKNOWN_AUTHENTICATION_SERVER = 109, WLAN_STATUS_SAE_HASH_TO_ELEMENT = 126, WLAN_STATUS_SAE_PK = 127, + WLAN_STATUS_DENIED_TID_TO_LINK_MAPPING = 133, + WLAN_STATUS_PREF_TID_TO_LINK_MAPPING_SUGGESTED = 134, }; @@ -3682,6 +3698,7 @@ enum ieee80211_category { WLAN_CATEGORY_UNPROT_DMG = 20, WLAN_CATEGORY_VHT = 21, WLAN_CATEGORY_S1G = 22, + WLAN_CATEGORY_PROTECTED_EHT = 37, WLAN_CATEGORY_VENDOR_SPECIFIC_PROTECTED = 126, WLAN_CATEGORY_VENDOR_SPECIFIC = 127, }; @@ -3745,6 +3762,13 @@ enum ieee80211_unprotected_wnm_actioncode { WLAN_UNPROTECTED_WNM_ACTION_TIMING_MEASUREMENT_RESPONSE = 1, }; +/* Protected EHT action codes */ +enum ieee80211_protected_eht_actioncode { + WLAN_PROTECTED_EHT_ACTION_TTLM_REQ = 0, + WLAN_PROTECTED_EHT_ACTION_TTLM_RES = 1, + WLAN_PROTECTED_EHT_ACTION_TTLM_TEARDOWN = 2, +}; + /* Security key length */ enum ieee80211_key_len { WLAN_KEY_LEN_WEP40 = 5, @@ -4845,6 +4869,10 @@ struct ieee80211_multi_link_elem { #define 
IEEE80211_MLD_CAP_OP_MAX_SIMUL_LINKS 0x000f #define IEEE80211_MLD_CAP_OP_SRS_SUPPORT 0x0010 #define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP 0x0060 +#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_NO_SUPP 0 +#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP_SAME 1 +#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_RESERVED 2 +#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP_DIFF 3 #define IEEE80211_MLD_CAP_OP_FREQ_SEP_TYPE_IND 0x0f80 #define IEEE80211_MLD_CAP_OP_AAR_SUPPORT 0x1000 -- cgit v1.2.3 From f7660b3f584aadd25dde18aa1902488577a15863 Mon Sep 17 00:00:00 2001 From: Ayala Beker Date: Tue, 2 Jan 2024 21:35:37 +0200 Subject: wifi: mac80211: add support for negotiated TTLM request Update neg_ttlm and active_links according to the new mapping, and send a negotiated TID-to-link map request with the new mapping. Signed-off-by: Ayala Beker Reviewed-by: Gregory Greenman Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://msgid.link/20240102213313.eeb385d771df.I2a5441c14421de884dbd93d1624ce7bb2c944833@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index eafa70e5ba94..f0c068446c79 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -5024,6 +5024,43 @@ static inline u16 ieee80211_mle_get_eml_cap(const u8 *data) return get_unaligned_le16(common); } +/** + * ieee80211_mle_get_mld_capa_op - returns the MLD capabilities and operations. + * @data: pointer to the multi link EHT IE + * + * The element is assumed to be of the correct type (BASIC) and big enough, + * this must be checked using ieee80211_mle_type_ok(). + * + * If the MLD capabilities and operations field is not present, 0 will be + * returned. 
+ */ +static inline u16 ieee80211_mle_get_mld_capa_op(const u8 *data) +{ + const struct ieee80211_multi_link_elem *mle = (const void *)data; + u16 control = le16_to_cpu(mle->control); + const u8 *common = mle->variable; + + /* + * common points now at the beginning of + * ieee80211_mle_basic_common_info + */ + common += sizeof(struct ieee80211_mle_basic_common_info); + + if (!(control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP)) + return 0; + + if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA) + common += 2; + + return get_unaligned_le16(common); +} + /** * ieee80211_mle_size_ok - validate multi-link element size * @data: pointer to the element data -- cgit v1.2.3 From ccb964b4ab1663ce92f389b72c052fc47a0ffdb9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 2 Jan 2024 21:35:44 +0200 Subject: wifi: cfg80211: validate MLO connections better When going into an MLO connection, validate that the link IDs match what userspace indicated, and that the AP MLD addresses and capabilities are all matching between the links. 
Signed-off-by: Johannes Berg Reviewed-by: Gregory Greenman Signed-off-by: Miri Korenblit Link: https://msgid.link/20240102213313.ff83c034cb9a.I9962db0bfa8c73b37b8d5b59a3fad7f02f2129ae@changeid [roll in extra fix from Miri to actually check the return value] Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index f0c068446c79..a70388ae3a7b 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4935,6 +4935,30 @@ static inline u8 ieee80211_mle_common_size(const u8 *data) return sizeof(*mle) + common + mle->variable[0]; } +/** + * ieee80211_mle_get_link_id - returns the link ID + * @data: the basic multi link element + * + * The element is assumed to be of the correct type (BASIC) and big enough, + * this must be checked using ieee80211_mle_type_ok(). + * + * If the BSS link ID can't be found, -1 will be returned + */ +static inline int ieee80211_mle_get_link_id(const u8 *data) +{ + const struct ieee80211_multi_link_elem *mle = (const void *)data; + u16 control = le16_to_cpu(mle->control); + const u8 *common = mle->variable; + + /* common points now at the beginning of ieee80211_mle_basic_common_info */ + common += sizeof(struct ieee80211_mle_basic_common_info); + + if (!(control & IEEE80211_MLC_BASIC_PRES_LINK_ID)) + return -1; + + return *common; +} + /** * ieee80211_mle_get_bss_param_ch_cnt - returns the BSS parameter change count * @mle: the basic multi link element -- cgit v1.2.3 From d12a82848eac28d248e67940378fe4a72b0a8cd3 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 22 Jan 2024 13:42:40 +0100 Subject: bitmap: Define a cleanup function for bitmaps Add support for autopointers for bitmaps allocated with bitmap_alloc() et al. 
Signed-off-by: Bartosz Golaszewski Signed-off-by: Thomas Gleixner Reviewed-by: Andy Shevchenko Acked-by: Yury Norov Link: https://lore.kernel.org/r/20240122124243.44002-2-brgl@bgdev.pl --- include/linux/bitmap.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 99451431e4d6..df24c8fb1009 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -127,6 +128,8 @@ unsigned long *bitmap_alloc_node(unsigned int nbits, gfp_t flags, int node); unsigned long *bitmap_zalloc_node(unsigned int nbits, gfp_t flags, int node); void bitmap_free(const unsigned long *bitmap); +DEFINE_FREE(bitmap, unsigned long *, if (_T) bitmap_free(_T)) + /* Managed variants of the above. */ unsigned long *devm_bitmap_alloc(struct device *dev, unsigned int nbits, gfp_t flags); -- cgit v1.2.3 From e563d0a7cdc1890ff36bb177b5c8c2854d881e4d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 26 Jan 2024 11:55:50 -1000 Subject: workqueue: Break up enum definitions and give names to the types workqueue is collecting different sorts of enums into a single unnamed enum type which can increase confusion around enum width. Also, unnamed enums can't be accessed from BPF. Let's break up enum definitions according to their purposes and give them type names. 
Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 2cc0a9606175..78047d0d9882 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -22,7 +22,7 @@ */ #define work_data_bits(work) ((unsigned long *)(&(work)->data)) -enum { +enum work_bits { WORK_STRUCT_PENDING_BIT = 0, /* work item is pending execution */ WORK_STRUCT_INACTIVE_BIT= 1, /* work item is inactive */ WORK_STRUCT_PWQ_BIT = 2, /* data points to pwq */ @@ -36,21 +36,6 @@ enum { WORK_STRUCT_COLOR_BITS = 4, - WORK_STRUCT_PENDING = 1 << WORK_STRUCT_PENDING_BIT, - WORK_STRUCT_INACTIVE = 1 << WORK_STRUCT_INACTIVE_BIT, - WORK_STRUCT_PWQ = 1 << WORK_STRUCT_PWQ_BIT, - WORK_STRUCT_LINKED = 1 << WORK_STRUCT_LINKED_BIT, -#ifdef CONFIG_DEBUG_OBJECTS_WORK - WORK_STRUCT_STATIC = 1 << WORK_STRUCT_STATIC_BIT, -#else - WORK_STRUCT_STATIC = 0, -#endif - - WORK_NR_COLORS = (1 << WORK_STRUCT_COLOR_BITS), - - /* not bound to any CPU, prefer the local CPU */ - WORK_CPU_UNBOUND = NR_CPUS, - /* * Reserve 8 bits off of pwq pointer w/ debugobjects turned off. * This makes pwqs aligned to 256 bytes and allows 16 workqueue @@ -74,6 +59,26 @@ enum { WORK_OFFQ_LEFT = BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT, WORK_OFFQ_POOL_BITS = WORK_OFFQ_LEFT <= 31 ? 
WORK_OFFQ_LEFT : 31, +}; + +enum work_flags { + WORK_STRUCT_PENDING = 1 << WORK_STRUCT_PENDING_BIT, + WORK_STRUCT_INACTIVE = 1 << WORK_STRUCT_INACTIVE_BIT, + WORK_STRUCT_PWQ = 1 << WORK_STRUCT_PWQ_BIT, + WORK_STRUCT_LINKED = 1 << WORK_STRUCT_LINKED_BIT, +#ifdef CONFIG_DEBUG_OBJECTS_WORK + WORK_STRUCT_STATIC = 1 << WORK_STRUCT_STATIC_BIT, +#else + WORK_STRUCT_STATIC = 0, +#endif +}; + +enum wq_misc_consts { + WORK_NR_COLORS = (1 << WORK_STRUCT_COLOR_BITS), + + /* not bound to any CPU, prefer the local CPU */ + WORK_CPU_UNBOUND = NR_CPUS, + /* bit mask for work_busy() return values */ WORK_BUSY_PENDING = 1 << 0, WORK_BUSY_RUNNING = 1 << 1, @@ -347,7 +352,7 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } * Workqueue flags and constants. For details, please refer to * Documentation/core-api/workqueue.rst. */ -enum { +enum wq_flags { WQ_UNBOUND = 1 << 1, /* not bound to any cpu */ WQ_FREEZABLE = 1 << 2, /* freeze during suspend */ WQ_MEM_RECLAIM = 1 << 3, /* may be used for memory reclaim */ @@ -387,7 +392,9 @@ enum { __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ __WQ_ORDERED_EXPLICIT = 1 << 19, /* internal: alloc_ordered_workqueue() */ +}; +enum wq_consts { WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_UNBOUND_MAX_ACTIVE = WQ_MAX_ACTIVE, WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2, -- cgit v1.2.3 From 73697f0acc773a357946a3c5a917bfb4c85128a3 Mon Sep 17 00:00:00 2001 From: Andrew Davis Date: Tue, 23 Jan 2024 09:09:11 -0600 Subject: power: supply: bq27xxx: Add devm action to free IDA Use a device lifecycle managed action to free the IDA. This helps prevent mistakes like freeing out of order in cleanup functions and forgetting to free on error paths. 
Signed-off-by: Andrew Davis Link: https://lore.kernel.org/r/20240123150914.308510-2-afd@ti.com Signed-off-by: Sebastian Reichel --- include/linux/power/bq27xxx_battery.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h index 7d8025fb74b7..b9e5bd2b42d3 100644 --- a/include/linux/power/bq27xxx_battery.h +++ b/include/linux/power/bq27xxx_battery.h @@ -61,7 +61,6 @@ struct bq27xxx_reg_cache { struct bq27xxx_device_info { struct device *dev; - int id; enum bq27xxx_chip chip; u32 opts; const char *name; -- cgit v1.2.3 From 7ae215ee7bb855f13c80565470fc7f67db4ba82f Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Thu, 25 Jan 2024 21:36:59 +0100 Subject: net: phy: add support for PHY LEDs polarity modes Add support for PHY LEDs polarity modes. Some PHY require LED to be set to active low to be turned ON. Adds support for this by declaring active-low property in DT. PHY driver needs to declare .led_polarity_set() to configure LED polarity modes. Function will pass the index with the LED index and a bitmap with all the required modes to set. Current supported modes are: - active-low with the flag PHY_LED_ACTIVE_LOW. LED is set to active-low to turn it ON. - inactive-high-impedance with the flag PHY_LED_INACTIVE_HIGH_IMPEDANCE. LED is set to high impedance to turn it OFF. 
Signed-off-by: Christian Marangi Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20240125203702.4552-4-ansuelsmth@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 684efaeca07c..c9994a59ca2e 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -852,6 +852,15 @@ struct phy_plca_status { bool pst; }; +/* Modes for PHY LED configuration */ +enum phy_led_modes { + PHY_LED_ACTIVE_LOW = 0, + PHY_LED_INACTIVE_HIGH_IMPEDANCE = 1, + + /* keep it last */ + __PHY_LED_MODES_NUM, +}; + /** * struct phy_led: An LED driven by the PHY * @@ -1145,6 +1154,19 @@ struct phy_driver { int (*led_hw_control_get)(struct phy_device *dev, u8 index, unsigned long *rules); + /** + * @led_polarity_set: Set the LED polarity modes + * @dev: PHY device which has the LED + * @index: Which LED of the PHY device + * @modes: bitmap of LED polarity modes + * + * Configure LED with all the required polarity modes in @modes + * to make it correctly turn ON or OFF. + * + * Returns 0, or an error code. + */ + int (*led_polarity_set)(struct phy_device *dev, int index, + unsigned long modes); }; #define to_phy_driver(d) container_of(to_mdio_common_driver(d), \ struct phy_driver, mdiodrv) -- cgit v1.2.3 From 4ca79255101b9edd6ac874d39361fd2b52927af0 Mon Sep 17 00:00:00 2001 From: Alexander Tsoy Date: Fri, 26 Jan 2024 00:16:35 +0300 Subject: usb: audio-v2: Correct comments for struct uac_clock_selector_descriptor This is likely a copy-paste error. Replace "Source" with "Selector" where appropriate. 
Fixes: 7e847894039d7 ("linux/usb/audio.h: split header") Signed-off-by: Alexander Tsoy Link: https://lore.kernel.org/r/20240125211635.30140-1-alexander@tsoy.me Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/audio-v2.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/audio-v2.h b/include/linux/usb/audio-v2.h index ca796dc1a984..6e5555610010 100644 --- a/include/linux/usb/audio-v2.h +++ b/include/linux/usb/audio-v2.h @@ -82,7 +82,7 @@ struct uac_clock_source_descriptor { #define UAC_CLOCK_SOURCE_TYPE_INT_PROG 0x3 #define UAC_CLOCK_SOURCE_SYNCED_TO_SOF (1 << 2) -/* 4.7.2.2 Clock Source Descriptor */ +/* 4.7.2.2 Clock Selector Descriptor */ struct uac_clock_selector_descriptor { __u8 bLength; @@ -91,7 +91,7 @@ struct uac_clock_selector_descriptor { __u8 bClockID; __u8 bNrInPins; __u8 baCSourceID[]; - /* bmControls and iClockSource omitted */ + /* bmControls and iClockSelector omitted */ } __attribute__((packed)); /* 4.7.2.3 Clock Multiplier Descriptor */ -- cgit v1.2.3 From 5de5f1e292e56fe4b8d28923d325f4c16f3766cf Mon Sep 17 00:00:00 2001 From: Stanley Chang Date: Wed, 13 Dec 2023 11:10:06 +0800 Subject: phy: core: add notify_connect and notify_disconnect callback In Realtek SoC, the parameter of usb phy is designed to be able to do dynamic tuning based in the port status. Therefore, add a notify callback of phy driver when usb connection/disconnection change. 
Signed-off-by: Stanley Chang Link: https://lore.kernel.org/r/20231213031203.4911-1-stanley_chang@realtek.com Signed-off-by: Greg Kroah-Hartman --- include/linux/phy/phy.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h index f6d607ef0e80..aa76609ba258 100644 --- a/include/linux/phy/phy.h +++ b/include/linux/phy/phy.h @@ -122,6 +122,11 @@ struct phy_ops { union phy_configure_opts *opts); int (*reset)(struct phy *phy); int (*calibrate)(struct phy *phy); + + /* notify phy connect status change */ + int (*connect)(struct phy *phy, int port); + int (*disconnect)(struct phy *phy, int port); + void (*release)(struct phy *phy); struct module *owner; }; @@ -243,6 +248,8 @@ static inline enum phy_mode phy_get_mode(struct phy *phy) } int phy_reset(struct phy *phy); int phy_calibrate(struct phy *phy); +int phy_notify_connect(struct phy *phy, int port); +int phy_notify_disconnect(struct phy *phy, int port); static inline int phy_get_bus_width(struct phy *phy) { return phy->attrs.bus_width; @@ -396,6 +403,20 @@ static inline int phy_calibrate(struct phy *phy) return -ENOSYS; } +static inline int phy_notify_connect(struct phy *phy, int index) +{ + if (!phy) + return 0; + return -ENOSYS; +} + +static inline int phy_notify_disconnect(struct phy *phy, int index) +{ + if (!phy) + return 0; + return -ENOSYS; +} + static inline int phy_configure(struct phy *phy, union phy_configure_opts *opts) { -- cgit v1.2.3 From 7494d4bc8e32a9480fd56b018db8e404b54b24e6 Mon Sep 17 00:00:00 2001 From: RD Babiera Date: Mon, 8 Jan 2024 19:16:14 +0000 Subject: usb: typec: altmodes: add typec_cable_ops to typec_altmode Add typec_cable_ops struct for enter, exit, and vdm. The struct is added to typec_altmode so port alt modes can have access to partner and cable specific callbacks, and alt mode drivers can specify operations over SOP' and SOP'' without modifying the existing API. 
typec_port_register_cable_ops is added as a new symbol for port drivers to use to register cable operations to their registered port alt modes. Signed-off-by: RD Babiera Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240108191620.987785-15-rdbabiera@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/typec.h | 4 ++++ include/linux/usb/typec_altmode.h | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h index a05d6f6f2536..38f93d72fd1b 100644 --- a/include/linux/usb/typec.h +++ b/include/linux/usb/typec.h @@ -18,6 +18,7 @@ struct typec_cable; struct typec_plug; struct typec_port; struct typec_altmode_ops; +struct typec_cable_ops; struct fwnode_handle; struct device; @@ -157,6 +158,9 @@ void typec_port_register_altmodes(struct typec_port *port, const struct typec_altmode_ops *ops, void *drvdata, struct typec_altmode **altmodes, size_t n); +void typec_port_register_cable_ops(struct typec_altmode **altmodes, int max_altmodes, + const struct typec_cable_ops *ops); + void typec_unregister_altmode(struct typec_altmode *altmode); struct typec_port *typec_altmode2port(struct typec_altmode *alt); diff --git a/include/linux/usb/typec_altmode.h b/include/linux/usb/typec_altmode.h index 28aeef8f9e7b..72ec8058543a 100644 --- a/include/linux/usb/typec_altmode.h +++ b/include/linux/usb/typec_altmode.h @@ -20,6 +20,7 @@ struct typec_altmode_ops; * @active: Tells has the mode been entered or not * @desc: Optional human readable description of the mode * @ops: Operations vector from the driver + * @cable_ops: Cable operations vector from the driver. 
*/ struct typec_altmode { struct device dev; @@ -30,6 +31,7 @@ struct typec_altmode { char *desc; const struct typec_altmode_ops *ops; + const struct typec_cable_ops *cable_ops; }; #define to_typec_altmode(d) container_of(d, struct typec_altmode, dev) @@ -75,6 +77,24 @@ int typec_altmode_notify(struct typec_altmode *altmode, unsigned long conf, const struct typec_altmode * typec_altmode_get_partner(struct typec_altmode *altmode); +/** + * struct typec_cable_ops - Cable alternate mode operations vector + * @enter: Operations to be executed with Enter Mode Command + * @exit: Operations to be executed with Exit Mode Command + * @vdm: Callback for SVID specific commands + */ +struct typec_cable_ops { + int (*enter)(struct typec_altmode *altmode, enum typec_plug_index sop, u32 *vdo); + int (*exit)(struct typec_altmode *altmode, enum typec_plug_index sop); + int (*vdm)(struct typec_altmode *altmode, enum typec_plug_index sop, + const u32 hdr, const u32 *vdo, int cnt); +}; + +int typec_cable_altmode_enter(struct typec_altmode *altmode, enum typec_plug_index sop, u32 *vdo); +int typec_cable_altmode_exit(struct typec_altmode *altmode, enum typec_plug_index sop); +int typec_cable_altmode_vdm(struct typec_altmode *altmode, enum typec_plug_index sop, + const u32 header, const u32 *vdo, int count); + /* * These are the connector states (USB, Safe and Alt Mode) defined in USB Type-C * Specification. SVID specific connector states are expected to follow and -- cgit v1.2.3 From 231b7318413cef0f8e5c2ca8db1a95b666c25d70 Mon Sep 17 00:00:00 2001 From: RD Babiera Date: Mon, 8 Jan 2024 19:16:15 +0000 Subject: usb: typec: altmodes: add svdm version info for typec cables Add typec_cable_set_svdm_version and typec_get_cable_svdm_version symbols. Cables can operate under a lower PD revision than the port partner, and the max SVDM version is tied to the PD revision. So, typec_cable maintains its own svdm_version. 
Add typec_altmode_get_cable_svdm_version to return the cable's negotiated svdm_version for altmode drivers to use. Signed-off-by: RD Babiera Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240108191620.987785-16-rdbabiera@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/typec.h | 3 +++ include/linux/usb/typec_altmode.h | 10 ++++++++++ 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h index 38f93d72fd1b..b35b427561ab 100644 --- a/include/linux/usb/typec.h +++ b/include/linux/usb/typec.h @@ -337,6 +337,9 @@ void typec_partner_set_svdm_version(struct typec_partner *partner, enum usb_pd_svdm_ver svdm_version); int typec_get_negotiated_svdm_version(struct typec_port *port); +int typec_get_cable_svdm_version(struct typec_port *port); +void typec_cable_set_svdm_version(struct typec_cable *cable, enum usb_pd_svdm_ver svdm_version); + struct usb_power_delivery *typec_partner_usb_power_delivery_register(struct typec_partner *partner, struct usb_power_delivery_desc *desc); diff --git a/include/linux/usb/typec_altmode.h b/include/linux/usb/typec_altmode.h index 72ec8058543a..b3c0866ea70f 100644 --- a/include/linux/usb/typec_altmode.h +++ b/include/linux/usb/typec_altmode.h @@ -95,6 +95,16 @@ int typec_cable_altmode_exit(struct typec_altmode *altmode, enum typec_plug_inde int typec_cable_altmode_vdm(struct typec_altmode *altmode, enum typec_plug_index sop, const u32 header, const u32 *vdo, int count); +/** + * typec_altmode_get_cable_svdm_version - Get negotiated SVDM version for cable plug + * @altmode: Handle to the alternate mode + */ +static inline int +typec_altmode_get_cable_svdm_version(struct typec_altmode *altmode) +{ + return typec_get_cable_svdm_version(typec_altmode2port(altmode)); +} + /* * These are the connector states (USB, Safe and Alt Mode) defined in USB Type-C * Specification. 
SVID specific connector states are expected to follow and -- cgit v1.2.3 From 59cd27a0cab1ceddcc4251309fd3643921ed9ab9 Mon Sep 17 00:00:00 2001 From: RD Babiera Date: Mon, 8 Jan 2024 19:16:16 +0000 Subject: usb: typec: tcpci: add cable_comm_capable attribute Add cable_comm_capable to tcpci_data for tcpci drivers to indicate that the port tcpc is capable of communicating to cables over SOP. A corresponding tcpci callback tcpci_cable_comm_capable returns this value. The tcpm will primarily use this in later patches to determine if the port can transmit and receive SOP' messages. Maxim based tcpci drivers are capable of SOP' communication, so the cable_comm_capable flag is set to true. Signed-off-by: RD Babiera Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240108191620.987785-17-rdbabiera@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/tcpci.h | 3 +++ include/linux/usb/tcpm.h | 4 ++++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/tcpci.h b/include/linux/usb/tcpci.h index 467e8045e9f8..1d0b849defd0 100644 --- a/include/linux/usb/tcpci.h +++ b/include/linux/usb/tcpci.h @@ -198,12 +198,15 @@ struct tcpci; * Chip level drivers are expected to check for contaminant and call * tcpm_clean_port when the port is clean to put the port back into * toggling state. 
+ * @cable_comm_capable + * optional; Set when TCPC can communicate with cable plugs over SOP' */ struct tcpci_data { struct regmap *regmap; unsigned char TX_BUF_BYTE_x_hidden:1; unsigned char auto_discharge_disconnect:1; unsigned char vbus_vsafe0v:1; + unsigned char cable_comm_capable:1; int (*init)(struct tcpci *tcpci, struct tcpci_data *data); int (*set_vconn)(struct tcpci *tcpci, struct tcpci_data *data, diff --git a/include/linux/usb/tcpm.h b/include/linux/usb/tcpm.h index 65fac5e1f317..430fa3ec69bb 100644 --- a/include/linux/usb/tcpm.h +++ b/include/linux/usb/tcpm.h @@ -119,6 +119,9 @@ enum tcpm_transmit_type { * at the end of the deboumce period or when the port is still * toggling. Chip level drivers are expected to check for contaminant * and call tcpm_clean_port when the port is clean. + * @cable_comm_capable + * Optional; Returns whether cable communication over SOP' is supported + * by the tcpc */ struct tcpc_dev { struct fwnode_handle *fwnode; @@ -154,6 +157,7 @@ struct tcpc_dev { bool (*is_vbus_vsafe0v)(struct tcpc_dev *dev); void (*set_partner_usb_comm_capable)(struct tcpc_dev *dev, bool enable); void (*check_contaminant)(struct tcpc_dev *dev); + bool (*cable_comm_capable)(struct tcpc_dev *dev); }; struct tcpm_port; -- cgit v1.2.3 From 3bbb9ba4f66006f27ad0d5ceaf2480117e16d489 Mon Sep 17 00:00:00 2001 From: RD Babiera Date: Mon, 8 Jan 2024 19:16:17 +0000 Subject: usb: typec: tcpci: add tcpm_transmit_type to tcpm_pd_receive tcpm_pd_receive adds the SOP type as a parameter, and passes it within the pd_rx_event struct for tcpm_pd_rx_handler to use. For now, the handler drops all SOP' messages. Maxim based tcpci drivers are capable of SOP' communication, so process_rx now takes the SOP type into account and passes the value to tcpm_pd_receive. tcpci_set_pd_rx now utilizes the cable_comm_capable flag to determine if TCPC_RX_DETECT_SOP1 should be added to the bitfield when enabling PD message reception. 
For all other consumers of tcpm_pd_receive, default the new field to TCPC_TX_SOP. Signed-off-by: RD Babiera Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240108191620.987785-18-rdbabiera@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/tcpci.h | 1 + include/linux/usb/tcpm.h | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/tcpci.h b/include/linux/usb/tcpci.h index 1d0b849defd0..9ed6d62c9c5f 100644 --- a/include/linux/usb/tcpci.h +++ b/include/linux/usb/tcpci.h @@ -145,6 +145,7 @@ #define TCPC_RX_BYTE_CNT 0x30 #define TCPC_RX_BUF_FRAME_TYPE 0x31 #define TCPC_RX_BUF_FRAME_TYPE_SOP 0 +#define TCPC_RX_BUF_FRAME_TYPE_SOP1 1 #define TCPC_RX_HDR 0x32 #define TCPC_RX_DATA 0x34 /* through 0x4f */ diff --git a/include/linux/usb/tcpm.h b/include/linux/usb/tcpm.h index 430fa3ec69bb..41d1ac9c8bbf 100644 --- a/include/linux/usb/tcpm.h +++ b/include/linux/usb/tcpm.h @@ -170,7 +170,8 @@ void tcpm_cc_change(struct tcpm_port *port); void tcpm_sink_frs(struct tcpm_port *port); void tcpm_sourcing_vbus(struct tcpm_port *port); void tcpm_pd_receive(struct tcpm_port *port, - const struct pd_message *msg); + const struct pd_message *msg, + enum tcpm_transmit_type rx_sop_type); void tcpm_pd_transmit_complete(struct tcpm_port *port, enum tcpm_transmit_status status); void tcpm_pd_hard_reset(struct tcpm_port *port); -- cgit v1.2.3 From 6bd181ba60e198fef6f750b543832f161fbd9f39 Mon Sep 17 00:00:00 2001 From: RD Babiera Date: Mon, 8 Jan 2024 19:16:19 +0000 Subject: usb: typec: tcpm: add control message support to sop' Add tx_sop_type to tcpm_pd_send_control and rx_sop_type to tcpm_pd_ctrl_request. TCPC_TX_SOP is added to all pd_send_control calls, but TCPC_TX_SOP_PRIME is added to pd_send_control for a SOFT_RESET message sent after a Vconn swap that makes the Port the Vconn source. Likewise, tcpm_pd_ctrl_request resets the proper protocol layer depending on rx_sop_type for SOFT_RESET. 
VCONN_SWAP_TURN_ON_VCONN now moves to a new state, VCONN_SWAP_SEND_SOFT_RESET. This state sends SOFT_RESET over SOP' before transitioning to the ready state if applicable. It transitions after PD_T_VCONN_STABLE, defined in pd.h as the time required for Vconn to be on before transmitting messages. Signed-off-by: RD Babiera Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240108191620.987785-20-rdbabiera@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/pd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/usb/pd.h b/include/linux/usb/pd.h index eb626af0e4e7..d50098fb16b5 100644 --- a/include/linux/usb/pd.h +++ b/include/linux/usb/pd.h @@ -483,6 +483,7 @@ static inline unsigned int rdo_max_power(u32 rdo) #define PD_T_BIST_CONT_MODE 50 /* 30 - 60 ms */ #define PD_T_SINK_TX 16 /* 16 - 20 ms */ #define PD_T_CHUNK_NOT_SUPP 42 /* 40 - 50 ms */ +#define PD_T_VCONN_STABLE 50 #define PD_T_DRP_TRY 100 /* 75 - 150 ms */ #define PD_T_DRP_TRYWAIT 600 /* 400 - 800 ms */ -- cgit v1.2.3 From 030509ac473da439e3d5438b1cd3c5b899844046 Mon Sep 17 00:00:00 2001 From: RD Babiera Date: Mon, 8 Jan 2024 19:16:20 +0000 Subject: usb: typec: tcpci: add attempt_vconn_swap_discovery callback Add attempt_vconn_swap_discovery callback to determine whether the TCPM should perform a Vconn swap following Discover Identity on SOP. The tcpci will return false unless chip level drivers implement the callback. Maxim based TCPCs will return true unless the last connection resulted in a Vconn Over Current Fault, which may be the result of the Vconn swap. In addition to the port resetting, the TCPCI will veto the next Vconn swap from occurring. 
Signed-off-by: RD Babiera Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240108191620.987785-21-rdbabiera@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/tcpci.h | 9 +++++++++ include/linux/usb/tcpm.h | 9 +++++++++ 2 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/tcpci.h b/include/linux/usb/tcpci.h index 9ed6d62c9c5f..47a86b8a4a50 100644 --- a/include/linux/usb/tcpci.h +++ b/include/linux/usb/tcpci.h @@ -201,6 +201,14 @@ struct tcpci; * toggling state. * @cable_comm_capable * optional; Set when TCPC can communicate with cable plugs over SOP' + * @attempt_vconn_swap_discovery: + * Optional; The callback is called by the TCPM when the result of + * a Discover Identity request indicates that the port partner is + * a receptacle capable of modal operation. Chip level TCPCI drivers + * can implement their own policy to determine if and when a Vconn + * swap following Discover Identity on SOP' occurs. + * Return true when the TCPM is allowed to request a Vconn swap + * after Discovery Identity on SOP. 
*/ struct tcpci_data { struct regmap *regmap; @@ -219,6 +227,7 @@ struct tcpci_data { void (*set_partner_usb_comm_capable)(struct tcpci *tcpci, struct tcpci_data *data, bool capable); void (*check_contaminant)(struct tcpci *tcpci, struct tcpci_data *data); + bool (*attempt_vconn_swap_discovery)(struct tcpci *tcpci, struct tcpci_data *data); }; struct tcpci *tcpci_register_port(struct device *dev, struct tcpci_data *data); diff --git a/include/linux/usb/tcpm.h b/include/linux/usb/tcpm.h index 41d1ac9c8bbf..6671427f7eeb 100644 --- a/include/linux/usb/tcpm.h +++ b/include/linux/usb/tcpm.h @@ -122,6 +122,14 @@ enum tcpm_transmit_type { * @cable_comm_capable * Optional; Returns whether cable communication over SOP' is supported * by the tcpc + * @attempt_vconn_swap_discovery: + * Optional; The callback is called by the TCPM when the result of + * a Discover Identity request indicates that the port partner is + * a receptacle capable of modal operation. Chip level TCPCI drivers + * can implement their own policy to determine if and when a Vconn + * swap following Discover Identity on SOP' occurs. + * Return true when the TCPM is allowed to request a Vconn swap + * after Discovery Identity on SOP. */ struct tcpc_dev { struct fwnode_handle *fwnode; @@ -158,6 +166,7 @@ struct tcpc_dev { void (*set_partner_usb_comm_capable)(struct tcpc_dev *dev, bool enable); void (*check_contaminant)(struct tcpc_dev *dev); bool (*cable_comm_capable)(struct tcpc_dev *dev); + bool (*attempt_vconn_swap_discovery)(struct tcpc_dev *dev); }; struct tcpm_port; -- cgit v1.2.3 From fb7ff25ae43332cb64c9e7bbbe36a6cc308d8de1 Mon Sep 17 00:00:00 2001 From: RD Babiera Date: Mon, 8 Jan 2024 19:16:21 +0000 Subject: usb: typec: tcpm: add discover identity support for SOP' Add data message handling and Discover Identity SVDM over SOP' This patch contains the following changes: 1. 
pd_vdo Add VDO indices for active and passive cables, documentation to reflect expected number of objects depending on PD Revision, and macro to indicate port partner is data host capable. 2. tcpm Add typec_cable and typec_plug to tcpm_port to maintain cable and plug information. tcpm_port also adds send_discover_prime to indicate that Discover Identity should be sent out of the ready state. tcpm_queue_vdm and tcpm_send_vdm now take the SOP* type when transmitting messages. tcpm_handle_vdm_request and tcpm_pd_svdm also use the SOP* type. tcpm_pd_svdm handles Discover Identity messages for SOP and SOP'. In the SOP case, the port uses tcpm_attempt_vconn_swap_discovery to determine if a Vconn swap is needed for cable communication. Otherwise, the port will send Discover Identity on SOP' if it can, or default to Discover SVIDs. svdm_consume_identity_sop_prime consumes the result of Discover Identity on SOP'. It fills out cable identity and description, and it registers the cable. The SOP' plug is registered as well. The VDM state machine is adjusted to construct messages based on the SOP* type. If a transmission error occurs after the max number of retries for Discover Identity over SOP', then the port will send Discover SVIDs over SOP. Signed-off-by: RD Babiera Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240108191620.987785-22-rdbabiera@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/pd_vdo.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/pd_vdo.h b/include/linux/usb/pd_vdo.h index 3a747938cdab..c09c5a12e273 100644 --- a/include/linux/usb/pd_vdo.h +++ b/include/linux/usb/pd_vdo.h @@ -86,12 +86,15 @@ * * Request is simply properly formatted SVDM header * - * Response is 4 data objects: + * Response is 4 data objects for Power Delivery 2.0 and Passive Cables for + * Power Delivery 3.0. Active Cables in Power Delivery 3.0 have 5 data objects. 
* [0] :: SVDM header * [1] :: Identitiy header * [2] :: Cert Stat VDO * [3] :: (Product | Cable) VDO + * [4] :: Cable VDO 1 * [4] :: AMA VDO + * [5] :: Cable VDO 2 * */ #define VDO_INDEX_HDR 0 @@ -100,6 +103,8 @@ #define VDO_INDEX_CABLE 3 #define VDO_INDEX_PRODUCT 3 #define VDO_INDEX_AMA 4 +#define VDO_INDEX_CABLE_1 4 +#define VDO_INDEX_CABLE_2 5 /* * SVDM Identity Header @@ -150,6 +155,7 @@ #define PD_IDH_MODAL_SUPP(vdo) ((vdo) & (1 << 26)) #define PD_IDH_DFP_PTYPE(vdo) (((vdo) >> 23) & 0x7) #define PD_IDH_CONN_TYPE(vdo) (((vdo) >> 21) & 0x3) +#define PD_IDH_HOST_SUPP(vdo) ((vdo) & (1 << 31)) /* * Cert Stat VDO -- cgit v1.2.3 From 174657478cd8425288aeabf93b964b9387e096fa Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 22 Jan 2024 12:03:15 +0100 Subject: vgacon: inline vc_scrolldelta_helper() into vgacon_scrolldelta() Since commit 74d58cd48a8f ("USB: sisusbvga: remove console support"), vgacon_scrolldelta() is the only user of vc_scrolldelta_helper(). Inline the helper into vgacon_scrolldelta() and drop it. 
Signed-off-by: "Jiri Slaby (SUSE)" Cc: Helge Deller Cc: linux-fbdev@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Tested-by: Helge Deller # parisc STI console Link: https://lore.kernel.org/r/20240122110401.7289-2-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/vt_kern.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vt_kern.h b/include/linux/vt_kern.h index c1f5aebef170..a789ea3ed2a0 100644 --- a/include/linux/vt_kern.h +++ b/include/linux/vt_kern.h @@ -168,7 +168,4 @@ void vt_set_led_state(unsigned int console, int leds); void vt_kbd_con_start(unsigned int console); void vt_kbd_con_stop(unsigned int console); -void vc_scrolldelta_helper(struct vc_data *c, int lines, - unsigned int rolled_over, void *_base, unsigned int size); - #endif /* _VT_KERN_H */ -- cgit v1.2.3 From a0b8a1681254346010edd2f94e799fb6b6568cf1 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 22 Jan 2024 12:03:19 +0100 Subject: tty: vt: pass proper pointers from tioclinux() Pass proper types and proper pointers (the data with an offset) to the TIOCL_* handlers. So that they need not to cast or add anything to the passed pointer. This makes obvious what is passed/consumed. 
Signed-off-by: "Jiri Slaby (SUSE)" Tested-by: Helge Deller # parisc STI console Link: https://lore.kernel.org/r/20240122110401.7289-6-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/selection.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/selection.h b/include/linux/selection.h index 170ef28ff26b..b7cd23e56a2b 100644 --- a/include/linux/selection.h +++ b/include/linux/selection.h @@ -20,7 +20,7 @@ extern int set_selection_user(const struct tiocl_selection __user *sel, extern int set_selection_kernel(struct tiocl_selection *v, struct tty_struct *tty); extern int paste_selection(struct tty_struct *tty); -extern int sel_loadlut(char __user *p); +extern int sel_loadlut(u32 __user *lut); extern int mouse_reporting(void); extern void mouse_report(struct tty_struct * tty, int butt, int mrx, int mry); -- cgit v1.2.3 From beccdcfa15666c442ce79a5f963fcb34ec28084e Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 22 Jan 2024 12:03:21 +0100 Subject: tty: vt: pass vc_resize_user as a parameter It is pretty unfortunate to set vc_data::vc_resize_user in two callers of vc_do_resize(). vc_resize_user is immediately reset there (while remembering it). So instead of this back and forth, pass 'from_user' as a parameter. Notes on 'int user': * The name changes from 'user' to 'from_user' on some places to be consistent. * The type is bool now as 'int user' might evoke user's uid or whatever. Provided vc_resize() is called on many places and they need not to care about this parameter, its prototype is kept unchanged. Instead, it is now an inline calling a new __vc_resize() which implements the above. This patch makes the situation much more obvious. 
Signed-off-by: "Jiri Slaby (SUSE)" Cc: Helge Deller Cc: Daniel Vetter Cc: linux-fbdev@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Tested-by: Helge Deller # parisc STI console Link: https://lore.kernel.org/r/20240122110401.7289-8-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/console.h | 2 +- include/linux/console_struct.h | 1 - include/linux/vt_kern.h | 9 ++++++++- 3 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 779d388af8a0..38b379d6c624 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -66,7 +66,7 @@ struct consw { int (*con_font_default)(struct vc_data *vc, struct console_font *font, char *name); int (*con_resize)(struct vc_data *vc, unsigned int width, - unsigned int height, unsigned int user); + unsigned int height, bool from_user); void (*con_set_palette)(struct vc_data *vc, const unsigned char *table); void (*con_scrolldelta)(struct vc_data *vc, int lines); diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h index 539f1cd45309..20f564e98552 100644 --- a/include/linux/console_struct.h +++ b/include/linux/console_struct.h @@ -151,7 +151,6 @@ struct vc_data { DECLARE_BITMAP(vc_tab_stop, VC_TABSTOPS_COUNT); /* Tab stops. 256 columns. 
*/ unsigned char vc_palette[16*3]; /* Colour palette for VGA+ */ unsigned short * vc_translate; - unsigned int vc_resize_user; /* resize request from user */ unsigned int vc_bell_pitch; /* Console bell pitch */ unsigned int vc_bell_duration; /* Console bell duration */ unsigned short vc_cur_blink_ms; /* Cursor blink duration */ diff --git a/include/linux/vt_kern.h b/include/linux/vt_kern.h index a789ea3ed2a0..d008c3d0a9bb 100644 --- a/include/linux/vt_kern.h +++ b/include/linux/vt_kern.h @@ -25,7 +25,8 @@ extern int fg_console, last_console, want_console; int vc_allocate(unsigned int console); int vc_cons_allocated(unsigned int console); -int vc_resize(struct vc_data *vc, unsigned int cols, unsigned int lines); +int __vc_resize(struct vc_data *vc, unsigned int cols, unsigned int lines, + bool from_user); struct vc_data *vc_deallocate(unsigned int console); void reset_palette(struct vc_data *vc); void do_blank_screen(int entering_gfx); @@ -42,6 +43,12 @@ void redraw_screen(struct vc_data *vc, int is_switch); #define update_screen(x) redraw_screen(x, 0) #define switch_screen(x) redraw_screen(x, 1) +static inline int vc_resize(struct vc_data *vc, unsigned int cols, + unsigned int lines) +{ + return __vc_resize(vc, cols, lines, false); +} + struct tty_struct; int tioclinux(struct tty_struct *tty, unsigned long arg); -- cgit v1.2.3 From d4c0c481e49fdf483c43e13e4a419ea19c045023 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 22 Jan 2024 12:03:22 +0100 Subject: tty: vt: make vc_is_sel()'s vc const It's only an aid to people reading the header and/or calling vc_is_sel(). vc is only tested there, so having it const makes sense. 
Signed-off-by: "Jiri Slaby (SUSE)" Tested-by: Helge Deller # parisc STI console Link: https://lore.kernel.org/r/20240122110401.7289-9-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/selection.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/selection.h b/include/linux/selection.h index b7cd23e56a2b..533509f6ba4f 100644 --- a/include/linux/selection.h +++ b/include/linux/selection.h @@ -24,7 +24,7 @@ extern int sel_loadlut(u32 __user *lut); extern int mouse_reporting(void); extern void mouse_report(struct tty_struct * tty, int butt, int mrx, int mry); -bool vc_is_sel(struct vc_data *vc); +bool vc_is_sel(const struct vc_data *vc); extern int console_blanked; -- cgit v1.2.3 From 649f6fbe6abe0c7749120067058709d41111f655 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 22 Jan 2024 12:03:32 +0100 Subject: tty: vt: remove extern from functions in selection.h Remove unneeded 'extern' keyword from function prototypes in selection.h. This makes it more readable as no more wrapping is needed in many places. 
Signed-off-by: "Jiri Slaby (SUSE)" Tested-by: Helge Deller # parisc STI console Link: https://lore.kernel.org/r/20240122110401.7289-19-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/selection.h | 52 ++++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/selection.h b/include/linux/selection.h index 533509f6ba4f..bab7d30d3446 100644 --- a/include/linux/selection.h +++ b/include/linux/selection.h @@ -14,15 +14,14 @@ struct tty_struct; struct vc_data; -extern void clear_selection(void); -extern int set_selection_user(const struct tiocl_selection __user *sel, - struct tty_struct *tty); -extern int set_selection_kernel(struct tiocl_selection *v, - struct tty_struct *tty); -extern int paste_selection(struct tty_struct *tty); -extern int sel_loadlut(u32 __user *lut); -extern int mouse_reporting(void); -extern void mouse_report(struct tty_struct * tty, int butt, int mrx, int mry); +void clear_selection(void); +int set_selection_user(const struct tiocl_selection __user *sel, + struct tty_struct *tty); +int set_selection_kernel(struct tiocl_selection *v, struct tty_struct *tty); +int paste_selection(struct tty_struct *tty); +int sel_loadlut(u32 __user *lut); +int mouse_reporting(void); +void mouse_report(struct tty_struct *tty, int butt, int mrx, int mry); bool vc_is_sel(const struct vc_data *vc); @@ -33,24 +32,21 @@ extern unsigned char default_red[]; extern unsigned char default_grn[]; extern unsigned char default_blu[]; -extern unsigned short *screen_pos(const struct vc_data *vc, int w_offset, - bool viewed); -extern u16 screen_glyph(const struct vc_data *vc, int offset); -extern u32 screen_glyph_unicode(const struct vc_data *vc, int offset); -extern void complement_pos(struct vc_data *vc, int offset); -extern void invert_screen(struct vc_data *vc, int offset, int count, bool viewed); - -extern void getconsxy(const struct vc_data *vc, unsigned char 
xy[static 2]); -extern void putconsxy(struct vc_data *vc, unsigned char xy[static const 2]); - -extern u16 vcs_scr_readw(const struct vc_data *vc, const u16 *org); -extern void vcs_scr_writew(struct vc_data *vc, u16 val, u16 *org); -extern void vcs_scr_updated(struct vc_data *vc); - -extern int vc_uniscr_check(struct vc_data *vc); -extern void vc_uniscr_copy_line(const struct vc_data *vc, void *dest, - bool viewed, - unsigned int row, unsigned int col, - unsigned int nr); +unsigned short *screen_pos(const struct vc_data *vc, int w_offset, bool viewed); +u16 screen_glyph(const struct vc_data *vc, int offset); +u32 screen_glyph_unicode(const struct vc_data *vc, int offset); +void complement_pos(struct vc_data *vc, int offset); +void invert_screen(struct vc_data *vc, int offset, int count, bool viewed); + +void getconsxy(const struct vc_data *vc, unsigned char xy[static 2]); +void putconsxy(struct vc_data *vc, unsigned char xy[static const 2]); + +u16 vcs_scr_readw(const struct vc_data *vc, const u16 *org); +void vcs_scr_writew(struct vc_data *vc, u16 val, u16 *org); +void vcs_scr_updated(struct vc_data *vc); + +int vc_uniscr_check(struct vc_data *vc); +void vc_uniscr_copy_line(const struct vc_data *vc, void *dest, bool viewed, + unsigned int row, unsigned int col, unsigned int nr); #endif -- cgit v1.2.3 From 7995c30d8d771c8410d7f2ba5b9d42b69e0074c8 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 22 Jan 2024 12:03:33 +0100 Subject: tty: vt: make consw::con_debug_*() return void The return value of con_debug_enter() and con_debug_leave() is ignored on many fronts. So just don't propagate errors (the current implementations return 0 anyway) and make the return type a void. 
Signed-off-by: "Jiri Slaby (SUSE)" Cc: Daniel Vetter Cc: Helge Deller Cc: linux-fbdev@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Tested-by: Helge Deller # parisc STI console Link: https://lore.kernel.org/r/20240122110401.7289-20-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/console.h | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 38b379d6c624..93a1db5bf3b5 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -88,11 +88,11 @@ struct consw { * limited to, unblanking the console, loading an appropriate * palette, and allowing debugger generated output. */ - int (*con_debug_enter)(struct vc_data *vc); + void (*con_debug_enter)(struct vc_data *vc); /* * Restore the console to its pre-debug state as closely as possible. */ - int (*con_debug_leave)(struct vc_data *vc); + void (*con_debug_leave)(struct vc_data *vc); }; extern const struct consw *conswitchp; @@ -113,17 +113,11 @@ int do_unregister_con_driver(const struct consw *csw); int do_take_over_console(const struct consw *sw, int first, int last, int deflt); void give_up_console(const struct consw *sw); #ifdef CONFIG_HW_CONSOLE -int con_debug_enter(struct vc_data *vc); -int con_debug_leave(void); +void con_debug_enter(struct vc_data *vc); +void con_debug_leave(void); #else -static inline int con_debug_enter(struct vc_data *vc) -{ - return 0; -} -static inline int con_debug_leave(void) -{ - return 0; -} +static inline void con_debug_enter(struct vc_data *vc) { } +static inline void con_debug_leave(void) { } #endif /* cursor */ -- cgit v1.2.3 From dae3e6b6180f1a2394b984c596d39ed2c57d25fe Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 22 Jan 2024 12:03:34 +0100 Subject: tty: vt: make init parameter of consw::con_init() a bool The 'init' parameter of consw::con_init() is true for the first call of the hook on a particular console. 
So make the parameter a bool. And document the hook. Signed-off-by: "Jiri Slaby (SUSE)" Reviewed-by: Geert Uytterhoeven Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Daniel Vetter Cc: linux-fbdev@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-parisc@vger.kernel.org Tested-by: Helge Deller # parisc STI console Link: https://lore.kernel.org/r/20240122110401.7289-21-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/console.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 93a1db5bf3b5..fc9450e0c78f 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -36,6 +36,8 @@ enum vc_intensity; /** * struct consw - callbacks for consoles * + * @con_init: initialize the console on @vc. @init is true for the very first + * call on this @vc. * @con_scroll: move lines from @top to @bottom in direction @dir by @lines. * Return true if no generic handling should be done. * Invoked by csi_M and printing to the console. @@ -46,7 +48,7 @@ enum vc_intensity; struct consw { struct module *owner; const char *(*con_startup)(void); - void (*con_init)(struct vc_data *vc, int init); + void (*con_init)(struct vc_data *vc, bool init); void (*con_deinit)(struct vc_data *vc); void (*con_clear)(struct vc_data *vc, int sy, int sx, int height, int width); -- cgit v1.2.3 From 559f01a0ee6d924c6fec3eaf6a5b078b15e71070 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 22 Jan 2024 12:03:35 +0100 Subject: tty: vt: sanitize arguments of consw::con_clear() In consw::con_clear(): * Height is always 1, so drop it. * Offsets and width are always unsigned values, so re-type them as such. This needs a new __fbcon_clear() in the fbcon code to still handle height which might not be 1 when called internally. Note that tests for negative count/width are left in place -- they are taken care of in the next patches. And document the hook. 
Signed-off-by: "Jiri Slaby (SUSE)" Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Daniel Vetter Cc: linux-fbdev@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-parisc@vger.kernel.org Tested-by: Helge Deller # parisc STI console Link: https://lore.kernel.org/r/20240122110401.7289-22-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/console.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index fc9450e0c78f..8fd96a5fca5f 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -38,6 +38,7 @@ enum vc_intensity; * * @con_init: initialize the console on @vc. @init is true for the very first * call on this @vc. + * @con_clear: erase @count characters at [@x, @y] on @vc. @count >= 1. * @con_scroll: move lines from @top to @bottom in direction @dir by @lines. * Return true if no generic handling should be done. * Invoked by csi_M and printing to the console. @@ -50,8 +51,8 @@ struct consw { const char *(*con_startup)(void); void (*con_init)(struct vc_data *vc, bool init); void (*con_deinit)(struct vc_data *vc); - void (*con_clear)(struct vc_data *vc, int sy, int sx, int height, - int width); + void (*con_clear)(struct vc_data *vc, unsigned int y, + unsigned int x, unsigned int count); void (*con_putc)(struct vc_data *vc, int c, int ypos, int xpos); void (*con_putcs)(struct vc_data *vc, const unsigned short *s, int count, int ypos, int xpos); -- cgit v1.2.3 From 338c28107b51083846afdc5fe8f7830cc8abd893 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 22 Jan 2024 12:03:39 +0100 Subject: tty: vt: sanitize consw::con_putc() parameters Make parameters of consw::con_putc() saner: * x and y are unsigned now, as they cannot be negative, and * ca is made u16, as it is composed of two 8bit values (character and attribute). See the con_putcs() hook, u16/ushort is worked on there. And document the hook. 
Signed-off-by: "Jiri Slaby (SUSE)" Cc: Helge Deller Cc: linux-fbdev@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Tested-by: Helge Deller # parisc STI console Link: https://lore.kernel.org/r/20240122110401.7289-26-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/console.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 8fd96a5fca5f..92d57e5b3009 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -39,6 +39,8 @@ enum vc_intensity; * @con_init: initialize the console on @vc. @init is true for the very first * call on this @vc. * @con_clear: erase @count characters at [@x, @y] on @vc. @count >= 1. + * @con_putc: emit one character with attributes @ca to [@x, @y] on @vc. + * (optional -- @con_putcs would be called instead) * @con_scroll: move lines from @top to @bottom in direction @dir by @lines. * Return true if no generic handling should be done. * Invoked by csi_M and printing to the console. @@ -53,7 +55,8 @@ struct consw { void (*con_deinit)(struct vc_data *vc); void (*con_clear)(struct vc_data *vc, unsigned int y, unsigned int x, unsigned int count); - void (*con_putc)(struct vc_data *vc, int c, int ypos, int xpos); + void (*con_putc)(struct vc_data *vc, u16 ca, unsigned int y, + unsigned int x); void (*con_putcs)(struct vc_data *vc, const unsigned short *s, int count, int ypos, int xpos); void (*con_cursor)(struct vc_data *vc, int mode); -- cgit v1.2.3 From bfd7de49d7444ce46a48e92ce7cb11266ce79905 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 22 Jan 2024 12:03:40 +0100 Subject: tty: vt: sanitize consw::con_putcs() parameters Similar to con_putc() in the previous patch: * make the pointer to charattr a pointer to u16, and * make x, y, and count unsigned as they are strictly non-negative. And again, document that hook. 
Signed-off-by: "Jiri Slaby (SUSE)" Tested-by: Helge Deller # parisc STI console Link: https://lore.kernel.org/r/20240122110401.7289-27-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/console.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 92d57e5b3009..82d55764a66f 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -41,6 +41,7 @@ enum vc_intensity; * @con_clear: erase @count characters at [@x, @y] on @vc. @count >= 1. * @con_putc: emit one character with attributes @ca to [@x, @y] on @vc. * (optional -- @con_putcs would be called instead) + * @con_putcs: emit @count characters with attributes @s to [@x, @y] on @vc. * @con_scroll: move lines from @top to @bottom in direction @dir by @lines. * Return true if no generic handling should be done. * Invoked by csi_M and printing to the console. @@ -57,8 +58,9 @@ struct consw { unsigned int x, unsigned int count); void (*con_putc)(struct vc_data *vc, u16 ca, unsigned int y, unsigned int x); - void (*con_putcs)(struct vc_data *vc, const unsigned short *s, - int count, int ypos, int xpos); + void (*con_putcs)(struct vc_data *vc, const u16 *s, + unsigned int count, unsigned int ypos, + unsigned int xpos); void (*con_cursor)(struct vc_data *vc, int mode); bool (*con_scroll)(struct vc_data *vc, unsigned int top, unsigned int bottom, enum con_scroll dir, -- cgit v1.2.3 From a292e3fc94cb9795bbba4ddac075a9055cd58a5e Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 22 Jan 2024 12:03:43 +0100 Subject: tty: vt: remove CM_* constants There is no difference between CM_MOVE and CM_DRAW. Either of them enables the cursor. CM_ERASE then disables cursor. So get rid of all of them and use simple "bool enable". Note that this propagates down to the fbcon code. And document the hook. Signed-off-by: "Jiri Slaby (SUSE)" Cc: Helge Deller Cc: "James E.J. 
Bottomley" Cc: Daniel Vetter Cc: linux-fbdev@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-parisc@vger.kernel.org Tested-by: Helge Deller # parisc STI console Link: https://lore.kernel.org/r/20240122110401.7289-30-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/console.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 82d55764a66f..a6a46b5efd66 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -42,6 +42,7 @@ enum vc_intensity; * @con_putc: emit one character with attributes @ca to [@x, @y] on @vc. * (optional -- @con_putcs would be called instead) * @con_putcs: emit @count characters with attributes @s to [@x, @y] on @vc. + * @con_cursor: enable/disable cursor depending on @enable * @con_scroll: move lines from @top to @bottom in direction @dir by @lines. * Return true if no generic handling should be done. * Invoked by csi_M and printing to the console. @@ -61,7 +62,7 @@ struct consw { void (*con_putcs)(struct vc_data *vc, const u16 *s, unsigned int count, unsigned int ypos, unsigned int xpos); - void (*con_cursor)(struct vc_data *vc, int mode); + void (*con_cursor)(struct vc_data *vc, bool enable); bool (*con_scroll)(struct vc_data *vc, unsigned int top, unsigned int bottom, enum con_scroll dir, unsigned int lines); @@ -128,11 +129,6 @@ static inline void con_debug_enter(struct vc_data *vc) { } static inline void con_debug_leave(void) { } #endif -/* cursor */ -#define CM_DRAW (1) -#define CM_ERASE (2) -#define CM_MOVE (3) - /* * The interface for a console, or any other device that wants to capture * console messages (printer driver?) 
-- cgit v1.2.3 From 8d5cc8eed738e3202379722295c626cba0849785 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 22 Jan 2024 12:03:44 +0100 Subject: tty: vt: make consw::con_switch() return a bool The non-zero (true) return value from consw::con_switch() means a redraw is needed. So make this return type a bool explicitly instead of int. The latter might imply that -Eerrors are expected. They are not. And document the hook. Signed-off-by: "Jiri Slaby (SUSE)" Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Daniel Vetter Cc: linux-fbdev@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-parisc@vger.kernel.org Tested-by: Helge Deller # parisc STI console Link: https://lore.kernel.org/r/20240122110401.7289-31-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/console.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index a6a46b5efd66..f7c6b5fc3a36 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -46,6 +46,8 @@ enum vc_intensity; * @con_scroll: move lines from @top to @bottom in direction @dir by @lines. * Return true if no generic handling should be done. * Invoked by csi_M and printing to the console. + * @con_switch: notifier about the console switch; it is supposed to return + * true if a redraw is needed. * @con_set_palette: sets the palette of the console to @table (optional) * @con_scrolldelta: the contents of the console should be scrolled by @lines. * Invoked by user. 
(optional) @@ -66,7 +68,7 @@ struct consw { bool (*con_scroll)(struct vc_data *vc, unsigned int top, unsigned int bottom, enum con_scroll dir, unsigned int lines); - int (*con_switch)(struct vc_data *vc); + bool (*con_switch)(struct vc_data *vc); int (*con_blank)(struct vc_data *vc, int blank, int mode_switch); int (*con_font_set)(struct vc_data *vc, struct console_font *font, unsigned int vpitch, unsigned int flags); -- cgit v1.2.3 From ace4ebf9b70a7daea12102c09ba5ef6bb73223aa Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 22 Jan 2024 12:03:46 +0100 Subject: tty: vt: define a common enum for VESA blanking constants There are currently two places with VESA blanking constants definitions: fb.h and console.h. Extract/unify the two to a separate header (vesa.h). Given the fb's is in an uapi header, create the common header in uapi too. Note that instead of macros, an enum (vesa_blank_mode) is created. But the macros are kept too (they now expand to the enum constants), just in case someone in userspace performs some #ifdeffery. 
Signed-off-by: "Jiri Slaby (SUSE)" Cc: Helge Deller Cc: linux-kernel@vger.kernel.org Cc: linux-fbdev@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: Thomas Zimmermann Tested-by: Helge Deller # parisc STI console Link: https://lore.kernel.org/r/20240122110401.7289-33-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/console.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index f7c6b5fc3a36..860f82756c9c 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -18,6 +18,7 @@ #include <linux/bits.h> #include <linux/rculist.h> #include <linux/types.h> +#include <linux/vesa.h> struct vc_data; struct console_font_op; @@ -520,12 +521,6 @@ void vcs_remove_sysfs(int index); */ extern atomic_t ignore_console_lock_warning; -/* VESA Blanking Levels */ -#define VESA_NO_BLANKING 0 -#define VESA_VSYNC_SUSPEND 1 -#define VESA_HSYNC_SUSPEND 2 -#define VESA_POWERDOWN 3 - extern void console_init(void); /* For deferred console takeover */ -- cgit v1.2.3 From 0a58d83dfb14ac30126c37b18d4578e5b261459d Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 22 Jan 2024 12:03:48 +0100 Subject: tty: vt: use enum constants for VESA blanking modes Use the new enum for VESA constants. This improves type checking in consw::con_blank(). Signed-off-by: "Jiri Slaby (SUSE)" Cc: Helge Deller Cc: "James E.J. 
Bottomley" Cc: Daniel Vetter Cc: linux-fbdev@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-parisc@vger.kernel.org Tested-by: Helge Deller # parisc STI console Link: https://lore.kernel.org/r/20240122110401.7289-35-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/console.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 860f82756c9c..69040d7c8f97 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -70,7 +70,8 @@ struct consw { unsigned int bottom, enum con_scroll dir, unsigned int lines); bool (*con_switch)(struct vc_data *vc); - int (*con_blank)(struct vc_data *vc, int blank, int mode_switch); + int (*con_blank)(struct vc_data *vc, enum vesa_blank_mode blank, + int mode_switch); int (*con_font_set)(struct vc_data *vc, struct console_font *font, unsigned int vpitch, unsigned int flags); int (*con_font_get)(struct vc_data *vc, struct console_font *font, -- cgit v1.2.3 From 77e110936a42b212c0fb576356ed274eb1d90c54 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 22 Jan 2024 12:03:49 +0100 Subject: tty: vt: make types around consw::con_blank() bool Both the mode_switch parameter and the return value (a redraw needed) are true/false. So switch them to bool, so that users won't return -Eerrors or anything else. And document the hook. Signed-off-by: "Jiri Slaby (SUSE)" Cc: Helge Deller Cc: "James E.J. 
Bottomley" Cc: Daniel Vetter Cc: linux-fbdev@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-parisc@vger.kernel.org Tested-by: Helge Deller # parisc STI console Link: https://lore.kernel.org/r/20240122110401.7289-36-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/console.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 69040d7c8f97..6392bcd2fe7c 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -49,6 +49,9 @@ enum vc_intensity; * Invoked by csi_M and printing to the console. * @con_switch: notifier about the console switch; it is supposed to return * true if a redraw is needed. + * @con_blank: blank/unblank the console. The target mode is passed in @blank. + * @mode_switch is set if changing from/to text/graphics. The hook + * is supposed to return true if a redraw is needed. * @con_set_palette: sets the palette of the console to @table (optional) * @con_scrolldelta: the contents of the console should be scrolled by @lines. * Invoked by user. (optional) @@ -70,8 +73,8 @@ struct consw { unsigned int bottom, enum con_scroll dir, unsigned int lines); bool (*con_switch)(struct vc_data *vc); - int (*con_blank)(struct vc_data *vc, enum vesa_blank_mode blank, - int mode_switch); + bool (*con_blank)(struct vc_data *vc, enum vesa_blank_mode blank, + bool mode_switch); int (*con_font_set)(struct vc_data *vc, struct console_font *font, unsigned int vpitch, unsigned int flags); int (*con_font_get)(struct vc_data *vc, struct console_font *font, -- cgit v1.2.3 From fd0f631fffa87f1c26045c3c88c0c4a7706d14de Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 22 Jan 2024 12:03:50 +0100 Subject: tty: vt: make font of consw::con_font_set() const Provided the font parameter of consw::con_font_set() is not supposed to be changed, make it const. 
Signed-off-by: "Jiri Slaby (SUSE)" Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Daniel Vetter Cc: linux-fbdev@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-parisc@vger.kernel.org Tested-by: Helge Deller # parisc STI console Link: https://lore.kernel.org/r/20240122110401.7289-37-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/console.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 6392bcd2fe7c..0a9f4cbdde83 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -75,8 +75,9 @@ struct consw { bool (*con_switch)(struct vc_data *vc); bool (*con_blank)(struct vc_data *vc, enum vesa_blank_mode blank, bool mode_switch); - int (*con_font_set)(struct vc_data *vc, struct console_font *font, - unsigned int vpitch, unsigned int flags); + int (*con_font_set)(struct vc_data *vc, + const struct console_font *font, + unsigned int vpitch, unsigned int flags); int (*con_font_get)(struct vc_data *vc, struct console_font *font, unsigned int vpitch); int (*con_font_default)(struct vc_data *vc, -- cgit v1.2.3 From 4f59617065592c446cd8450e9e6bac229cbc1383 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 22 Jan 2024 12:03:51 +0100 Subject: tty: vt: make consw::con_font_default()'s name const It's a name after all and that is not supposed to be changed. So make it const to make this obvious. Signed-off-by: "Jiri Slaby (SUSE)" Cc: "James E.J. 
Bottomley" Cc: Daniel Vetter Cc: linux-fbdev@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-parisc@vger.kernel.org Tested-by: Helge Deller # parisc STI console Link: https://lore.kernel.org/r/20240122110401.7289-38-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/console.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 0a9f4cbdde83..6bb7e5e37ae4 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -81,7 +81,7 @@ struct consw { int (*con_font_get)(struct vc_data *vc, struct console_font *font, unsigned int vpitch); int (*con_font_default)(struct vc_data *vc, - struct console_font *font, char *name); + struct console_font *font, const char *name); int (*con_resize)(struct vc_data *vc, unsigned int width, unsigned int height, bool from_user); void (*con_set_palette)(struct vc_data *vc, -- cgit v1.2.3 From 42822fabfc24f4fc8d5404d9359fa17a0bcfcea8 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 22 Jan 2024 12:03:52 +0100 Subject: tty: vt: change consw::con_set_origin() return type The return value of consw::con_set_origin() is only true/false, meaning whether vc->vc_origin is set to vc->vc_screenbuf or not. So switch the type and returned values accordingly. And document the hook. 
Signed-off-by: "Jiri Slaby (SUSE)" Cc: Helge Deller Cc: linux-fbdev@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Tested-by: Helge Deller # parisc STI console Link: https://lore.kernel.org/r/20240122110401.7289-39-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/console.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 6bb7e5e37ae4..82e4b554a801 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -55,6 +55,9 @@ enum vc_intensity; * @con_set_palette: sets the palette of the console to @table (optional) * @con_scrolldelta: the contents of the console should be scrolled by @lines. * Invoked by user. (optional) + * @con_set_origin: set origin (see &vc_data::vc_origin) of the @vc. If not + * provided or returns false, the origin is set to + * @vc->vc_screenbuf. (optional) */ struct consw { struct module *owner; @@ -87,7 +90,7 @@ struct consw { void (*con_set_palette)(struct vc_data *vc, const unsigned char *table); void (*con_scrolldelta)(struct vc_data *vc, int lines); - int (*con_set_origin)(struct vc_data *vc); + bool (*con_set_origin)(struct vc_data *vc); void (*con_save_screen)(struct vc_data *vc); u8 (*con_build_attr)(struct vc_data *vc, u8 color, enum vc_intensity intensity, -- cgit v1.2.3 From 7cf01c92addb73c3055ff0fc596441c80ce82113 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 22 Jan 2024 12:03:54 +0100 Subject: tty: vt: remove consw::con_screen_pos() After the previous patch, nobody sets that hook. So drop it completely. 
Signed-off-by: "Jiri Slaby (SUSE)" Tested-by: Helge Deller # parisc STI console Link: https://lore.kernel.org/r/20240122110401.7289-41-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/console.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 82e4b554a801..b2d8621cea57 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -96,7 +96,6 @@ struct consw { enum vc_intensity intensity, bool blink, bool underline, bool reverse, bool italic); void (*con_invert_region)(struct vc_data *vc, u16 *p, int count); - u16 *(*con_screen_pos)(const struct vc_data *vc, int offset); unsigned long (*con_getxy)(struct vc_data *vc, unsigned long position, int *px, int *py); /* -- cgit v1.2.3 From f441aa3b441306e35e8fcbec5ac13c68b5f48245 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 22 Jan 2024 12:03:57 +0100 Subject: tty: vt: remove consw::con_getxy() After the previous patch, nobody sets that hook. So drop it completely. 
Signed-off-by: "Jiri Slaby (SUSE)" Tested-by: Helge Deller # parisc STI console Link: https://lore.kernel.org/r/20240122110401.7289-44-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/console.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index b2d8621cea57..fa2cd81102b8 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -96,8 +96,6 @@ struct consw { enum vc_intensity intensity, bool blink, bool underline, bool reverse, bool italic); void (*con_invert_region)(struct vc_data *vc, u16 *p, int count); - unsigned long (*con_getxy)(struct vc_data *vc, unsigned long position, - int *px, int *py); /* * Flush the video console driver's scrollback buffer */ -- cgit v1.2.3 From b23bf1a43bdbce1a281f11169dd9d426018b00c9 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 22 Jan 2024 12:03:58 +0100 Subject: tty: vt: remove unused consw::con_flush_scrollback() consw::con_flush_scrollback() is unused since commit 973c096f6a85 (vgacon: remove software scrollback support). Drop it. Signed-off-by: "Jiri Slaby (SUSE)" Tested-by: Helge Deller # parisc STI console Link: https://lore.kernel.org/r/20240122110401.7289-45-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/console.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index fa2cd81102b8..1eac3e6e32a2 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -96,10 +96,6 @@ struct consw { enum vc_intensity intensity, bool blink, bool underline, bool reverse, bool italic); void (*con_invert_region)(struct vc_data *vc, u16 *p, int count); - /* - * Flush the video console driver's scrollback buffer - */ - void (*con_flush_scrollback)(struct vc_data *vc); /* * Prepare the console for the debugger. 
This includes, but is not * limited to, unblanking the console, loading an appropriate -- cgit v1.2.3 From d1e2221644c490a73d2968fe316f0af170e0ebcf Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 22 Jan 2024 12:03:59 +0100 Subject: tty: vt: document the rest of struct consw There are still members of struct consw which are not documented yet. Fix that up, so we can generate kernel-doc for that struct. Signed-off-by: "Jiri Slaby (SUSE)" Tested-by: Helge Deller # parisc STI console Link: https://lore.kernel.org/r/20240122110401.7289-46-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/console.h | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 1eac3e6e32a2..f1a334ad268d 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -37,8 +37,11 @@ enum vc_intensity; /** * struct consw - callbacks for consoles * + * @owner: the module to get references of when this console is used + * @con_startup: set up the console and return its name (like VGA, EGA, ...) * @con_init: initialize the console on @vc. @init is true for the very first * call on this @vc. + * @con_deinit: deinitialize the console from @vc. * @con_clear: erase @count characters at [@x, @y] on @vc. @count >= 1. * @con_putc: emit one character with attributes @ca to [@x, @y] on @vc. * (optional -- @con_putcs would be called instead) @@ -52,12 +55,33 @@ enum vc_intensity; * @con_blank: blank/unblank the console. The target mode is passed in @blank. * @mode_switch is set if changing from/to text/graphics. The hook * is supposed to return true if a redraw is needed. - * @con_set_palette: sets the palette of the console to @table (optional) + * @con_font_set: set console @vc font to @font with height @vpitch. @flags can + * be %KD_FONT_FLAG_DONT_RECALC. 
(optional) + * @con_font_get: fetch the current font on @vc of height @vpitch into @font. + * (optional) + * @con_font_default: set default font on @vc. @name can be %NULL or font name + * to search for. @font can be filled back. (optional) + * @con_resize: resize the @vc console to @width x @height. @from_user is true + * when this change comes from the user space. + * @con_set_palette: sets the palette of the console @vc to @table (optional) * @con_scrolldelta: the contents of the console should be scrolled by @lines. * Invoked by user. (optional) * @con_set_origin: set origin (see &vc_data::vc_origin) of the @vc. If not * provided or returns false, the origin is set to * @vc->vc_screenbuf. (optional) + * @con_save_screen: save screen content into @vc->vc_screenbuf. Called e.g. + * upon entering graphics. (optional) + * @con_build_attr: build attributes based on @color, @intensity and other + * parameters. The result is used for both normal and erase + * characters. (optional) + * @con_invert_region: invert a region of length @count on @vc starting at @p. + * (optional) + * @con_debug_enter: prepare the console for the debugger. This includes, but + * is not limited to, unblanking the console, loading an + * appropriate palette, and allowing debugger generated output. + * (optional) + * @con_debug_leave: restore the console to its pre-debug state as closely as + * possible. (optional) */ struct consw { struct module *owner; @@ -96,15 +120,7 @@ struct consw { enum vc_intensity intensity, bool blink, bool underline, bool reverse, bool italic); void (*con_invert_region)(struct vc_data *vc, u16 *p, int count); - /* - * Prepare the console for the debugger. This includes, but is not - * limited to, unblanking the console, loading an appropriate - * palette, and allowing debugger generated output. - */ void (*con_debug_enter)(struct vc_data *vc); - /* - * Restore the console to its pre-debug state as closely as possible. 
- */ void (*con_debug_leave)(struct vc_data *vc); }; -- cgit v1.2.3 From 60234365aee22c9ac576491f787f20a17279d28e Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 22 Jan 2024 12:04:00 +0100 Subject: tty: vt: fix up kernel-doc selection.c and vt.c still use tabs in the kernel-doc. This misrenders the functions in the output -- sphinx misinterprets the description. So remove these tabs, incl. those around dashes. 'enum' keyword is needed before enum names. Fix that. Superfluous \n after the comments are also removed. They are not completely faulty, but this unifies all the kernel-doc in the files. Finally fix up the cross references. Signed-off-by: "Jiri Slaby (SUSE)" Reviewed-by: Randy Dunlap Tested-by: Helge Deller # parisc STI console Link: https://lore.kernel.org/r/20240122110401.7289-47-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/console.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index f1a334ad268d..d6d8b7e6b93b 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -155,7 +155,7 @@ static inline void con_debug_leave(void) { } */ /** - * cons_flags - General console flags + * enum cons_flags - General console flags * @CON_PRINTBUFFER: Used by newly registered consoles to avoid duplicate * output of messages that were already shown by boot * consoles or read by userspace via syslog() syscall. @@ -236,7 +236,7 @@ struct nbcon_state { static_assert(sizeof(struct nbcon_state) <= sizeof(int)); /** - * nbcon_prio - console owner priority for nbcon consoles + * enum nbcon_prio - console owner priority for nbcon consoles * @NBCON_PRIO_NONE: Unused * @NBCON_PRIO_NORMAL: Normal (non-emergency) usage * @NBCON_PRIO_EMERGENCY: Emergency output (WARN/OOPS...)
@@ -468,7 +468,7 @@ static inline bool console_is_registered(const struct console *con) * for_each_console() - Iterator over registered consoles * @con: struct console pointer used as loop cursor * - * The console list and the console->flags are immutable while iterating. + * The console list and the &console.flags are immutable while iterating. * * Requires console_list_lock to be held. */ -- cgit v1.2.3 From fed99212acae832607817b24fa589f8aaf03103f Mon Sep 17 00:00:00 2001 From: Francesco Dolcini Date: Mon, 22 Jan 2024 19:05:51 +0100 Subject: treewide, serdev: change receive_buf() return type to size_t MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit receive_buf() is called from ttyport_receive_buf() that expects values ">= 0" from serdev_controller_receive_buf(), change its return type from ssize_t to size_t. The need for this clean-up was noticed while fixing a warning, see commit 94d053942544 ("Bluetooth: btnxpuart: fix recv_buf() return value"). Changing the callback prototype to return an unsigned value seems the best way to document the API and ensure that it is properly used. The GNSS drivers' implementations of the serdev receive_buf() callback directly return the return value of gnss_insert_raw(). gnss_insert_raw() returns a signed int, however this is not an issue since the value returned is always positive, because of the kfifo_in() implementation. gnss_insert_raw() could also be changed to return an unsigned value, however this is not implemented here, as requested by the GNSS maintainer Johan Hovold.
Suggested-by: Jiri Slaby Link: https://lore.kernel.org/all/087be419-ec6b-47ad-851a-5e1e3ea5cfcc@kernel.org/ Signed-off-by: Francesco Dolcini Acked-by: Jonathan Cameron #for-iio Reviewed-by: Johan Hovold Reviewed-by: Rob Herring Reviewed-by: Alex Elder Acked-by: Maximilian Luz # for platform/surface Acked-by: Lee Jones Acked-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20240122180551.34429-1-francesco@dolcini.it Signed-off-by: Greg Kroah-Hartman --- include/linux/serdev.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serdev.h b/include/linux/serdev.h index 3fab88ba265e..ff78efc1f60d 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -27,7 +27,7 @@ struct serdev_device; * not sleep. */ struct serdev_device_ops { - ssize_t (*receive_buf)(struct serdev_device *, const u8 *, size_t); + size_t (*receive_buf)(struct serdev_device *, const u8 *, size_t); void (*write_wakeup)(struct serdev_device *); }; @@ -185,9 +185,9 @@ static inline void serdev_controller_write_wakeup(struct serdev_controller *ctrl serdev->ops->write_wakeup(serdev); } -static inline ssize_t serdev_controller_receive_buf(struct serdev_controller *ctrl, - const u8 *data, - size_t count) +static inline size_t serdev_controller_receive_buf(struct serdev_controller *ctrl, + const u8 *data, + size_t count) { struct serdev_device *serdev = ctrl->serdev; -- cgit v1.2.3 From 85725449f3e5faf385210d535f266430be71cebb Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 10 Jan 2024 14:21:46 +0100 Subject: serial: 8250: Move hp300_setup_serial_console() to <linux/serial_8250.h> MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_SERIAL_8250_HP300=y and CONFIG_SERIAL_8250_CONSOLE=y (e.g.
m68k/allyesconfig): drivers/tty/serial/8250/8250_hp300.c:91:12: error: no previous prototype for ‘hp300_setup_serial_console’ [-Werror=missing-prototypes] 91 | int __init hp300_setup_serial_console(void) | ^~~~~~~~~~~~~~~~~~~~~~~~~~ Fix this by moving the existing prototype in arch/m68k/hp300/config.c to <linux/serial_8250.h>, so it is visible to both caller and implementor. While at it, provide a dummy in case CONFIG_SERIAL_8250_CONSOLE is not enabled, to reduce #ifdef clutter in the caller. Exposed by commit 0fcb70851fbfea17 ("Makefile.extrawarn: turn on missing-prototypes globally"). Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/c17469f8e47b2ef49234a85a7a14882ddf374e41.1704892597.git.geert@linux-m68k.org Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_8250.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index be65de65fe61..fd59ed2cca53 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -210,6 +210,12 @@ int serial8250_console_exit(struct uart_port *port); void serial8250_set_isa_configurator(void (*v)(int port, struct uart_port *up, u32 *capabilities)); +#ifdef CONFIG_SERIAL_8250_CONSOLE +extern int hp300_setup_serial_console(void) __init; +#else +static inline int hp300_setup_serial_console(void) { return 0; } +#endif + #ifdef CONFIG_SERIAL_8250_RT288X int rt288x_setup(struct uart_port *p); int au_platform_setup(struct plat_serial8250_port *p); -- cgit v1.2.3 From 486676116f4852d4198690c2c98af060cd96ab83 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 12 Jan 2024 15:03:07 -0800 Subject: soc: qcom: geni-se: Add M_TX_FIFO_NOT_EMPTY bit definition According to the docs I have, bit 21 of the status register is asserted when the FIFO is _not_ empty. Add the definition.
Signed-off-by: Douglas Anderson Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20240112150307.1.I7dc0993c1e758a1efedd651e7e1670deb1b430fb@changeid Signed-off-by: Greg Kroah-Hartman --- include/linux/soc/qcom/geni-se.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/geni-se.h b/include/linux/soc/qcom/geni-se.h index 29e06905bc1f..0f038a1a0330 100644 --- a/include/linux/soc/qcom/geni-se.h +++ b/include/linux/soc/qcom/geni-se.h @@ -178,6 +178,7 @@ struct geni_se { #define M_GP_IRQ_3_EN BIT(12) #define M_GP_IRQ_4_EN BIT(13) #define M_GP_IRQ_5_EN BIT(14) +#define M_TX_FIFO_NOT_EMPTY_EN BIT(21) #define M_IO_DATA_DEASSERT_EN BIT(22) #define M_IO_DATA_ASSERT_EN BIT(23) #define M_RX_FIFO_RD_ERR_EN BIT(24) -- cgit v1.2.3 From 5c49b6a4a4bcf368f85cfe7a0e5ac3a7016f30fd Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 8 Jan 2024 14:41:02 +0100 Subject: vt: remove superfluous CONFIG_HW_CONSOLE The config HW_CONSOLE is always identical to the config VT and is not visible in the kernel's build menuconfig. So, CONFIG_HW_CONSOLE is redundant. Replace all references to CONFIG_HW_CONSOLE with CONFIG_VT and remove CONFIG_HW_CONSOLE. 
Signed-off-by: Lukas Bulwahn Reviewed-by: Javier Martinez Canillas Acked-by: Dmitry Torokhov Reviewed-by: Geert Uytterhoeven Acked-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20240108134102.601-1-lukas.bulwahn@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/console.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index d6d8b7e6b93b..31a8f5b85f5d 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -141,7 +141,7 @@ int con_is_bound(const struct consw *csw); int do_unregister_con_driver(const struct consw *csw); int do_take_over_console(const struct consw *sw, int first, int last, int deflt); void give_up_console(const struct consw *sw); -#ifdef CONFIG_HW_CONSOLE +#ifdef CONFIG_VT void con_debug_enter(struct vc_data *vc); void con_debug_leave(void); #else -- cgit v1.2.3 From 88b7049635dc5d0e2a7dfaaf89e70a9654ed6561 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 24 Jan 2024 11:06:06 +0100 Subject: gpio: unexport GPIO irq domain functions only used internally There are no external users for the irq domain helpers so unexport them and remove the prototypes from the driver header. 
Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 9a5c6c76e653..363d06c7b637 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -704,18 +704,6 @@ int bgpio_init(struct gpio_chip *gc, struct device *dev, #define BGPIOF_NO_OUTPUT BIT(5) /* only input */ #define BGPIOF_NO_SET_ON_INPUT BIT(6) -int gpiochip_irq_map(struct irq_domain *d, unsigned int irq, - irq_hw_number_t hwirq); -void gpiochip_irq_unmap(struct irq_domain *d, unsigned int irq); - -int gpiochip_irq_domain_activate(struct irq_domain *domain, - struct irq_data *data, bool reserve); -void gpiochip_irq_domain_deactivate(struct irq_domain *domain, - struct irq_data *data); - -bool gpiochip_irqchip_irq_valid(const struct gpio_chip *gc, - unsigned int offset); - #ifdef CONFIG_GPIOLIB_IRQCHIP int gpiochip_irqchip_add_domain(struct gpio_chip *gc, struct irq_domain *domain); -- cgit v1.2.3 From 6933ba529d06afdd3faf5501855e410b46b77160 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 25 Jan 2024 09:35:07 +0100 Subject: gpio: improve the API contract for setting direction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a GPIO driver returns a positive integer from one of the direction setter callbacks, we'll end up propagating it to user-space. Whether we should sanitize the values returned by callbacks is a different question but let's first improve the documentation and fortify the contract with GPIO providers. 
Reported-by: José Guilherme de Castro Rodrigues Signed-off-by: Bartosz Golaszewski Reviewed-by: Kent Gibson --- include/linux/gpio/driver.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 363d06c7b637..3a37d058cfcf 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -335,10 +335,12 @@ struct gpio_irq_chip { * (same as GPIO_LINE_DIRECTION_OUT / GPIO_LINE_DIRECTION_IN), * or negative error. It is recommended to always implement this * function, even on input-only or output-only gpio chips. - * @direction_input: configures signal "offset" as input, or returns error - * This can be omitted on input-only or output-only gpio chips. - * @direction_output: configures signal "offset" as output, or returns error - * This can be omitted on input-only or output-only gpio chips. + * @direction_input: configures signal "offset" as input, returns 0 on success + * or a negative error number. This can be omitted on input-only or + * output-only gpio chips. + * @direction_output: configures signal "offset" as output, returns 0 on + * success or a negative error number. This can be omitted on input-only + * or output-only gpio chips. * @get: returns value for signal "offset", 0=low, 1=high, or negative error * @get_multiple: reads values for multiple signals defined by "mask" and * stores them in "bits", returns 0 on success or negative error -- cgit v1.2.3 From ea1cc3ee34a5f3144f6c2cdc07c19c914ccb9526 Mon Sep 17 00:00:00 2001 From: Min Li Date: Wed, 24 Jan 2024 13:49:46 -0500 Subject: ptp: introduce PTP_CLOCK_EXTOFF event for the measured external offset This change is for the PHC devices that can measure the phase offset between PHC signal and the external signal, such as the 1PPS signal of GNSS. 
Reporting PTP_CLOCK_EXTOFF to user space will be piggy-backed on the existing ptp_extts_event so that applications such as ts2phc can poll the external offset the same way as extts. Hence, ts2phc can use the offset to achieve the alignment between PHC and the external signal with the help of either SW or HW filters. Signed-off-by: Min Li Acked-by: Richard Cochran Signed-off-by: David S. Miller --- include/linux/ptp_clock_kernel.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 1ef4e0f9bd2a..6e4b8206c7d0 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -200,6 +200,7 @@ struct ptp_clock; enum ptp_clock_events { PTP_CLOCK_ALARM, PTP_CLOCK_EXTTS, + PTP_CLOCK_EXTOFF, PTP_CLOCK_PPS, PTP_CLOCK_PPSUSR, }; @@ -210,6 +211,7 @@ enum ptp_clock_events { * @type: One of the ptp_clock_events enumeration values. * @index: Identifies the source of the event. * @timestamp: When the event occurred (%PTP_CLOCK_EXTTS only). + * @offset: When the event occurred (%PTP_CLOCK_EXTOFF only). * @pps_times: When the event occurred (%PTP_CLOCK_PPSUSR only). */ @@ -218,6 +220,7 @@ struct ptp_clock_event { int index; union { u64 timestamp; + s64 offset; struct pps_event_time pps_times; }; }; -- cgit v1.2.3 From 1ddfecafabf71e0e5345dff877d2680083c7e078 Mon Sep 17 00:00:00 2001 From: Min Li Date: Wed, 24 Jan 2024 13:49:47 -0500 Subject: ptp: add FemtoClock3 Wireless as ptp hardware clock The RENESAS FemtoClock3 Wireless is a high-performance jitter attenuator, frequency translator, and clock synthesizer. The device is comprised of 3 digital PLLs (DPLL) to track CLKIN inputs and three independent low phase noise fractional output dividers (FOD) that output low phase noise clocks.
FemtoClock3 supports one Time Synchronization (Time Sync) channel to enable an external processor to control the phase and frequency of the Time Sync channel and to take phase measurements using the TDC. Intended applications are synchronization using the precision time protocol (PTP) and synchronization with 0.5 Hz and 1 Hz signals from GNSS. Signed-off-by: Min Li Acked-by: Lee Jones Signed-off-by: David S. Miller --- include/linux/mfd/idtRC38xxx_reg.h | 273 +++++++++++++++++++++++++++++++++++++ 1 file changed, 273 insertions(+) create mode 100644 include/linux/mfd/idtRC38xxx_reg.h (limited to 'include/linux') diff --git a/include/linux/mfd/idtRC38xxx_reg.h b/include/linux/mfd/idtRC38xxx_reg.h new file mode 100644 index 000000000000..ec11872f51ad --- /dev/null +++ b/include/linux/mfd/idtRC38xxx_reg.h @@ -0,0 +1,273 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Register Map - Based on PolarBear_CSRs.RevA.xlsx (2023-04-21) + * + * Copyright (C) 2023 Integrated Device Technology, Inc., a Renesas Company. 
+ */ +#ifndef MFD_IDTRC38XXX_REG +#define MFD_IDTRC38XXX_REG + +/* GLOBAL */ +#define SOFT_RESET_CTRL (0x15) /* Specific to FC3W */ +#define MISC_CTRL (0x14) /* Specific to FC3A */ +#define APLL_REINIT BIT(1) +#define APLL_REINIT_VFC3A BIT(2) + +#define DEVICE_ID (0x2) +#define DEVICE_ID_MASK (0x1000) /* Bit 12 is 1 if FC3W and 0 if FC3A */ +#define DEVICE_ID_SHIFT (12) + +/* FOD */ +#define FOD_0 (0x300) +#define FOD_0_VFC3A (0x400) +#define FOD_1 (0x340) +#define FOD_1_VFC3A (0x440) +#define FOD_2 (0x380) +#define FOD_2_VFC3A (0x480) + +/* TDCAPLL */ +#define TDC_CTRL (0x44a) /* Specific to FC3W */ +#define TDC_ENABLE_CTRL (0x169) /* Specific to FC3A */ +#define TDC_DAC_CAL_CTRL (0x16a) /* Specific to FC3A */ +#define TDC_EN BIT(0) +#define TDC_DAC_RECAL_REQ BIT(1) +#define TDC_DAC_RECAL_REQ_VFC3A BIT(0) + +#define TDC_FB_DIV_INT_CNFG (0x442) +#define TDC_FB_DIV_INT_CNFG_VFC3A (0x162) +#define TDC_FB_DIV_INT_MASK GENMASK(7, 0) +#define TDC_REF_DIV_CNFG (0x443) +#define TDC_REF_DIV_CNFG_VFC3A (0x163) +#define TDC_REF_DIV_CONFIG_MASK GENMASK(2, 0) + +/* TIME SYNC CHANNEL */ +#define TIME_CLOCK_SRC (0xa01) /* Specific to FC3W */ +#define TIME_CLOCK_COUNT (0xa00) /* Specific to FC3W */ +#define TIME_CLOCK_COUNT_MASK GENMASK(5, 0) + +#define SUB_SYNC_GEN_CNFG (0xa04) + +#define TOD_COUNTER_READ_REQ (0xa5f) +#define TOD_COUNTER_READ_REQ_VFC3A (0x6df) +#define TOD_SYNC_LOAD_VAL_CTRL (0xa10) +#define TOD_SYNC_LOAD_VAL_CTRL_VFC3A (0x690) +#define SYNC_COUNTER_MASK GENMASK_ULL(51, 0) +#define SUB_SYNC_COUNTER_MASK GENMASK(30, 0) +#define TOD_SYNC_LOAD_REQ_CTRL (0xa21) +#define TOD_SYNC_LOAD_REQ_CTRL_VFC3A (0x6a1) +#define SYNC_LOAD_ENABLE BIT(1) +#define SUB_SYNC_LOAD_ENABLE BIT(0) +#define SYNC_LOAD_REQ BIT(0) + +#define LPF_MODE_CNFG (0xa80) +#define LPF_MODE_CNFG_VFC3A (0x700) +enum lpf_mode { + LPF_DISABLED = 0, + LPF_WP = 1, + LPF_HOLDOVER = 2, + LPF_WF = 3, + LPF_INVALID = 4 +}; +#define LPF_CTRL (0xa98) +#define LPF_CTRL_VFC3A (0x718) +#define LPF_EN BIT(0) + 
+#define LPF_BW_CNFG (0xa81) +#define LPF_BW_SHIFT GENMASK(7, 3) +#define LPF_BW_MULT GENMASK(2, 0) +#define LPF_BW_SHIFT_DEFAULT (0xb) +#define LPF_BW_MULT_DEFAULT (0x0) +#define LPF_BW_SHIFT_1PPS (0x5) + +#define LPF_WR_PHASE_CTRL (0xaa8) +#define LPF_WR_PHASE_CTRL_VFC3A (0x728) +#define LPF_WR_FREQ_CTRL (0xab0) +#define LPF_WR_FREQ_CTRL_VFC3A (0x730) + +#define TIME_CLOCK_TDC_FANOUT_CNFG (0xB00) +#define TIME_SYNC_TO_TDC_EN BIT(0) +#define SIG1_MUX_SEL_MASK GENMASK(7, 4) +#define SIG2_MUX_SEL_MASK GENMASK(11, 8) +enum tdc_mux_sel { + REF0 = 0, + REF1 = 1, + REF2 = 2, + REF3 = 3, + REF_CLK5 = 4, + REF_CLK6 = 5, + DPLL_FB_TO_TDC = 6, + DPLL_FB_DIVIDED_TO_TDC = 7, + TIME_CLK_DIVIDED = 8, + TIME_SYNC = 9, +}; + +#define TIME_CLOCK_MEAS_CNFG (0xB04) +#define TDC_MEAS_MODE BIT(0) +enum tdc_meas_mode { + CONTINUOUS = 0, + ONE_SHOT = 1, + MEAS_MODE_INVALID = 2, +}; + +#define TIME_CLOCK_MEAS_DIV_CNFG (0xB08) +#define TIME_REF_DIV_MASK GENMASK(29, 24) + +#define TIME_CLOCK_MEAS_CTRL (0xB10) +#define TDC_MEAS_EN BIT(0) +#define TDC_MEAS_START BIT(1) + +#define TDC_FIFO_READ_REQ (0xB2F) +#define TDC_FIFO_READ (0xB30) +#define COARSE_MEAS_MASK GENMASK_ULL(39, 13) +#define FINE_MEAS_MASK GENMASK(12, 0) + +#define TDC_FIFO_CTRL (0xB12) +#define FIFO_CLEAR BIT(0) +#define TDC_FIFO_STS (0xB38) +#define FIFO_FULL BIT(1) +#define FIFO_EMPTY BIT(0) +#define TDC_FIFO_EVENT (0xB39) +#define FIFO_OVERRUN BIT(1) + +/* DPLL */ +#define MAX_REFERENCE_INDEX (3) +#define MAX_NUM_REF_PRIORITY (4) + +#define MAX_DPLL_INDEX (2) + +#define DPLL_STS (0x580) +#define DPLL_STS_VFC3A (0x571) +#define DPLL_STATE_STS_MASK (0x70) +#define DPLL_STATE_STS_SHIFT (4) +#define DPLL_REF_SEL_STS_MASK (0x6) +#define DPLL_REF_SEL_STS_SHIFT (1) + +#define DPLL_REF_PRIORITY_CNFG (0x502) +#define DPLL_REFX_PRIORITY_DISABLE_MASK (0xf) +#define DPLL_REF0_PRIORITY_ENABLE_AND_SET_MASK (0x31) +#define DPLL_REF1_PRIORITY_ENABLE_AND_SET_MASK (0xc2) +#define DPLL_REF2_PRIORITY_ENABLE_AND_SET_MASK (0x304) +#define 
DPLL_REF3_PRIORITY_ENABLE_AND_SET_MASK (0xc08) +#define DPLL_REF0_PRIORITY_SHIFT (4) +#define DPLL_REF1_PRIORITY_SHIFT (6) +#define DPLL_REF2_PRIORITY_SHIFT (8) +#define DPLL_REF3_PRIORITY_SHIFT (10) + +enum dpll_state { + DPLL_STATE_MIN = 0, + DPLL_STATE_FREERUN = DPLL_STATE_MIN, + DPLL_STATE_LOCKED = 1, + DPLL_STATE_HOLDOVER = 2, + DPLL_STATE_WRITE_FREQUENCY = 3, + DPLL_STATE_ACQUIRE = 4, + DPLL_STATE_HITLESS_SWITCH = 5, + DPLL_STATE_MAX = DPLL_STATE_HITLESS_SWITCH +}; + +/* REFMON */ +#define LOSMON_STS_0 (0x81e) +#define LOSMON_STS_0_VFC3A (0x18e) +#define LOSMON_STS_1 (0x82e) +#define LOSMON_STS_1_VFC3A (0x19e) +#define LOSMON_STS_2 (0x83e) +#define LOSMON_STS_2_VFC3A (0x1ae) +#define LOSMON_STS_3 (0x84e) +#define LOSMON_STS_3_VFC3A (0x1be) +#define LOS_STS_MASK (0x1) + +#define FREQMON_STS_0 (0x874) +#define FREQMON_STS_0_VFC3A (0x1d4) +#define FREQMON_STS_1 (0x894) +#define FREQMON_STS_1_VFC3A (0x1f4) +#define FREQMON_STS_2 (0x8b4) +#define FREQMON_STS_2_VFC3A (0x214) +#define FREQMON_STS_3 (0x8d4) +#define FREQMON_STS_3_VFC3A (0x234) +#define FREQ_FAIL_STS_SHIFT (31) + +/* Firmware interface */ +#define TIME_CLK_FREQ_ADDR (0xffa0) +#define XTAL_FREQ_ADDR (0xffa1) + +/* + * Return register address and field mask based on passed in firmware version + */ +#define IDTFC3_FW_REG(FW, VER, REG) (((FW) < (VER)) ? (REG) : (REG##_##VER)) +#define IDTFC3_FW_FIELD(FW, VER, FIELD) (((FW) < (VER)) ? 
(FIELD) : (FIELD##_##VER)) +enum fw_version { + V_DEFAULT = 0, + VFC3W = 1, + VFC3A = 2 +}; + +/* XTAL_FREQ_ADDR/TIME_CLK_FREQ_ADDR */ +enum { + FREQ_MIN = 0, + FREQ_25M = 1, + FREQ_49_152M = 2, + FREQ_50M = 3, + FREQ_100M = 4, + FREQ_125M = 5, + FREQ_250M = 6, + FREQ_MAX +}; + +struct idtfc3_hw_param { + u32 xtal_freq; + u32 time_clk_freq; +}; + +struct idtfc3_fwrc { + u8 hiaddr; + u8 loaddr; + u8 value; + u8 reserved; +} __packed; + +static inline void idtfc3_default_hw_param(struct idtfc3_hw_param *hw_param) +{ + hw_param->xtal_freq = 49152000; + hw_param->time_clk_freq = 25000000; +} + +static inline int idtfc3_set_hw_param(struct idtfc3_hw_param *hw_param, + u16 addr, u8 val) +{ + if (addr == XTAL_FREQ_ADDR) + switch (val) { + case FREQ_49_152M: + hw_param->xtal_freq = 49152000; + break; + case FREQ_50M: + hw_param->xtal_freq = 50000000; + break; + default: + return -EINVAL; + } + else if (addr == TIME_CLK_FREQ_ADDR) + switch (val) { + case FREQ_25M: + hw_param->time_clk_freq = 25000000; + break; + case FREQ_50M: + hw_param->time_clk_freq = 50000000; + break; + case FREQ_100M: + hw_param->time_clk_freq = 100000000; + break; + case FREQ_125M: + hw_param->time_clk_freq = 125000000; + break; + case FREQ_250M: + hw_param->time_clk_freq = 250000000; + break; + default: + return -EINVAL; + } + else + return -EFAULT; + + return 0; +} + +#endif -- cgit v1.2.3 From efaa47db92451608499ab7edf108bf30141c33db Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 28 Jan 2024 13:54:43 +0800 Subject: bpf: Remove unused field "mod" in struct bpf_trampoline It seems that the field "mod" in struct bpf_trampoline is not used anywhere after the commit 31bf1dbccfb0 ("bpf: Fix attaching fentry/fexit/fmod_ret/lsm to modules"). So we can just remove it now. 
Fixes: 31bf1dbccfb0 ("bpf: Fix attaching fentry/fexit/fmod_ret/lsm to modules") Signed-off-by: Menglong Dong Signed-off-by: Daniel Borkmann Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20240128055443.413291-1-dongmenglong.8@bytedance.com --- include/linux/bpf.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b86bd15a051d..1ebbee1d648e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1189,7 +1189,6 @@ struct bpf_trampoline { int progs_cnt[BPF_TRAMP_MAX]; /* Executable image of trampoline */ struct bpf_tramp_image *cur_image; - struct module *mod; }; struct bpf_attach_target_info { -- cgit v1.2.3 From 04d65a9dbb33e20500005e151d720acead78c539 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Thu, 25 Jan 2024 22:11:03 -0600 Subject: iommu/amd: Don't rely on external callers to enable IOMMU SNP support Currently, the expectation is that the kernel will call amd_iommu_snp_enable() to perform various checks and set the amd_iommu_snp_en flag that the IOMMU uses to adjust its setup routines to account for additional requirements on hosts where SNP is enabled. This is somewhat fragile as it relies on this call being done prior to IOMMU setup. It is more robust to just do this automatically as part of IOMMU initialization, so rework the code accordingly. There is still a need to export information about whether or not the IOMMU is configured in a manner compatible with SNP, so relocate the existing amd_iommu_snp_en flag so it can be used to convey that information in place of the return code that was previously provided by calls to amd_iommu_snp_enable(). While here, also adjust the kernel messages related to IOMMU SNP enablement for consistency/grammar/clarity. 
Suggested-by: Borislav Petkov (AMD) Signed-off-by: Ashish Kalra Co-developed-by: Michael Roth Signed-off-by: Michael Roth Signed-off-by: Borislav Petkov (AMD) Acked-by: Joerg Roedel Link: https://lore.kernel.org/r/20240126041126.1927228-4-michael.roth@amd.com --- include/linux/amd-iommu.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index dc7ed2f46886..7365be00a795 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -85,8 +85,4 @@ int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn, u64 *value); struct amd_iommu *get_amd_iommu(unsigned int idx); -#ifdef CONFIG_AMD_MEM_ENCRYPT -int amd_iommu_snp_enable(void); -#endif - #endif /* _ASM_X86_AMD_IOMMU_H */ -- cgit v1.2.3 From 5797b1c18919cd9c289ded7954383e499f729ce0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 29 Jan 2024 08:11:25 -1000 Subject: workqueue: Implement system-wide nr_active enforcement for unbound workqueues A pool_workqueue (pwq) represents the connection between a workqueue and a worker_pool. One of the roles that a pwq plays is enforcement of the max_active concurrency limit. Before 636b927eba5b ("workqueue: Make unbound workqueues to use per-cpu pool_workqueues"), there was one pwq per each CPU for per-cpu workqueues and per each NUMA node for unbound workqueues, which was a natural result of per-cpu workqueues being served by per-cpu pools and unbound by per-NUMA pools. In terms of max_active enforcement, this was, while not perfect, workable. For per-cpu workqueues, it was fine. For unbound, it wasn't great in that NUMA machines would get max_active that's multiplied by the number of nodes but didn't cause huge problems because NUMA machines are relatively rare and the node count is usually pretty low. However, cache layouts are more complex now and sharing a worker pool across a whole node didn't really work well for unbound workqueues. 
Thus, a series of commits culminating in 8639ecebc9b1 ("workqueue: Make unbound workqueues to use per-cpu pool_workqueues") implemented a more flexible affinity mechanism for unbound workqueues which enables using e.g. last-level-cache aligned pools. In the process, 636b927eba5b ("workqueue: Make unbound workqueues to use per-cpu pool_workqueues") made unbound workqueues use per-cpu pwqs like per-cpu workqueues. While the change was necessary to enable more flexible affinity scopes, this came with the side effect of blowing up the effective max_active for unbound workqueues. Before, the effective max_active for unbound workqueues was multiplied by the number of nodes. After, by the number of CPUs. 636b927eba5b ("workqueue: Make unbound workqueues to use per-cpu pool_workqueues") claims that this should generally be okay. It is okay for users which self-regulate their concurrency level, which are the vast majority; however, there are enough use cases which actually depend on max_active to prevent the level of concurrency from going bonkers including several IO handling workqueues that can issue a work item for each in-flight IO. With targeted benchmarks, the misbehavior can easily be exposed as reported in http://lkml.kernel.org/r/dbu6wiwu3sdhmhikb2w6lns7b27gbobfavhjj57kwi2quafgwl@htjcc5oikcr3. Unfortunately, there is no way to express what these use cases need using per-cpu max_active. A CPU may issue most of the in-flight IOs, so we don't want to set max_active too low but as soon as we increase max_active a bit, we can end up with an unreasonable number of in-flight work items when many CPUs issue IOs at the same time. i.e. the acceptable lowest max_active is higher than the acceptable highest max_active. Ideally, max_active for an unbound workqueue should be system-wide so that the users can regulate the total level of concurrency regardless of node and cache layout.
The reasons workqueue hasn't implemented that yet are: - Once max_active enforcement decouples from pool boundaries, chaining execution after a work item finishes requires inter-pool operations which would require lock dancing, which is nasty. - Sharing a single nr_active count across the whole system can be pretty expensive on NUMA machines. - Per-pwq enforcement had been more or less okay while we were using per-node pools. It looks like we no longer can avoid decoupling max_active enforcement from pool boundaries. This patch implements system-wide nr_active mechanism with the following design characteristics: - To avoid sharing a single counter across multiple nodes, the configured max_active is split across nodes according to the proportion of each workqueue's online effective CPUs per node. e.g. A node with twice more online effective CPUs will get twice higher portion of max_active. - Workqueue used to be able to process a chain of interdependent work items which is as long as max_active. We can't do this anymore as max_active is distributed across the nodes. Instead, a new parameter min_active is introduced which determines the minimum level of concurrency within a node regardless of how max_active distribution comes out to be. It is set to the smaller of max_active and WQ_DFL_MIN_ACTIVE which is 8. This can lead to higher effective max_active than configured and also deadlocks if a workqueue was depending on being able to handle chains of interdependent work items that are longer than 8. I believe these should be fine given that the number of CPUs in each NUMA node is usually higher than 8 and work item chain longer than 8 is pretty unlikely. However, if these assumptions turn out to be wrong, we'll need to add an interface to adjust min_active. - Each unbound wq has an array of struct wq_node_nr_active which tracks per-node nr_active. When its pwq wants to run a work item, it has to obtain the matching node's nr_active.
If over the node's max_active, the pwq is queued on wq_node_nr_active->pending_pwqs. As work items finish, the completion path round-robins the pending pwqs activating the first inactive work item of each, which involves some pool lock dancing and kicking other pools. It's not the simplest code but doesn't look too bad. v4: - wq_adjust_max_active() updated to invoke wq_update_node_max_active(). - wq_adjust_max_active() is now protected by wq->mutex instead of wq_pool_mutex. v3: - wq_node_max_active() used to calculate per-node max_active on the fly based on system-wide CPU online states. Lai pointed out that this can lead to skewed distributions for workqueues with restricted cpumasks. Update the max_active distribution to use per-workqueue effective online CPU counts instead of system-wide and cache the calculation results in node_nr_active->max. v2: - wq->min/max_active now uses WRITE/READ_ONCE() as suggested by Lai. Signed-off-by: Tejun Heo Reported-by: Naohiro Aota Link: http://lkml.kernel.org/r/dbu6wiwu3sdhmhikb2w6lns7b27gbobfavhjj57kwi2quafgwl@htjcc5oikcr3 Fixes: 636b927eba5b ("workqueue: Make unbound workqueues to use per-cpu pool_workqueues") Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 78047d0d9882..232baea90a1d 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -398,6 +398,13 @@ enum wq_consts { WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_UNBOUND_MAX_ACTIVE = WQ_MAX_ACTIVE, WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2, + + /* + * Per-node default cap on min_active. Unless explicitly set, min_active + * is set to min(max_active, WQ_DFL_MIN_ACTIVE). For more details, see + * workqueue_struct->min_active definition. 
+ */ + WQ_DFL_MIN_ACTIVE = 8, }; /* @@ -440,11 +447,33 @@ extern struct workqueue_struct *system_freezable_power_efficient_wq; * alloc_workqueue - allocate a workqueue * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags - * @max_active: max in-flight work items per CPU, 0 for default + * @max_active: max in-flight work items, 0 for default * remaining args: args for @fmt * - * Allocate a workqueue with the specified parameters. For detailed - * information on WQ_* flags, please refer to + * For a per-cpu workqueue, @max_active limits the number of in-flight work + * items for each CPU. e.g. @max_active of 1 indicates that each CPU can be + * executing at most one work item for the workqueue. + * + * For unbound workqueues, @max_active limits the number of in-flight work items + * for the whole system. e.g. @max_active of 16 indicates that that there can be + * at most 16 work items executing for the workqueue in the whole system. + * + * As sharing the same active counter for an unbound workqueue across multiple + * NUMA nodes can be expensive, @max_active is distributed to each NUMA node + * according to the proportion of the number of online CPUs and enforced + * independently. + * + * Depending on online CPU distribution, a node may end up with per-node + * max_active which is significantly lower than @max_active, which can lead to + * deadlocks if the per-node concurrency limit is lower than the maximum number + * of interdependent work items for the workqueue. + * + * To guarantee forward progress regardless of online CPU distribution, the + * concurrency limit on every node is guaranteed to be equal to or greater than + * min_active which is set to min(@max_active, %WQ_DFL_MIN_ACTIVE). This means + * that the sum of per-node max_active's may be larger than @max_active. + * + * For detailed information on %WQ_* flags, please refer to * Documentation/core-api/workqueue.rst. 
* * RETURNS: -- cgit v1.2.3 From 3a45dc2b419e691f3dd7fb42c2a1b1cc8146be4f Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Thu, 25 Jan 2024 22:11:12 -0600 Subject: crypto: ccp: Define the SEV-SNP commands AMD introduced the next generation of SEV called SEV-SNP (Secure Nested Paging). SEV-SNP builds upon existing SEV and SEV-ES functionality while adding new hardware security protection. Define the commands and structures used to communicate with the AMD-SP when creating and managing the SEV-SNP guests. The SEV-SNP firmware spec is available at developer.amd.com/sev. [ mdr: update SNP command list and SNP status struct based on current spec, use C99 flexible arrays, fix kernel-doc issues. ] Signed-off-by: Brijesh Singh Co-developed-by: Ashish Kalra Signed-off-by: Ashish Kalra Signed-off-by: Michael Roth Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240126041126.1927228-13-michael.roth@amd.com --- include/linux/psp-sev.h | 265 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 265 insertions(+) (limited to 'include/linux') diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 7fd17e82bab4..006e4cdbeb78 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -78,6 +78,36 @@ enum sev_cmd { SEV_CMD_DBG_DECRYPT = 0x060, SEV_CMD_DBG_ENCRYPT = 0x061, + /* SNP specific commands */ + SEV_CMD_SNP_INIT = 0x081, + SEV_CMD_SNP_SHUTDOWN = 0x082, + SEV_CMD_SNP_PLATFORM_STATUS = 0x083, + SEV_CMD_SNP_DF_FLUSH = 0x084, + SEV_CMD_SNP_INIT_EX = 0x085, + SEV_CMD_SNP_SHUTDOWN_EX = 0x086, + SEV_CMD_SNP_DECOMMISSION = 0x090, + SEV_CMD_SNP_ACTIVATE = 0x091, + SEV_CMD_SNP_GUEST_STATUS = 0x092, + SEV_CMD_SNP_GCTX_CREATE = 0x093, + SEV_CMD_SNP_GUEST_REQUEST = 0x094, + SEV_CMD_SNP_ACTIVATE_EX = 0x095, + SEV_CMD_SNP_LAUNCH_START = 0x0A0, + SEV_CMD_SNP_LAUNCH_UPDATE = 0x0A1, + SEV_CMD_SNP_LAUNCH_FINISH = 0x0A2, + SEV_CMD_SNP_DBG_DECRYPT = 0x0B0, + SEV_CMD_SNP_DBG_ENCRYPT = 0x0B1, + SEV_CMD_SNP_PAGE_SWAP_OUT = 0x0C0, + 
SEV_CMD_SNP_PAGE_SWAP_IN = 0x0C1, + SEV_CMD_SNP_PAGE_MOVE = 0x0C2, + SEV_CMD_SNP_PAGE_MD_INIT = 0x0C3, + SEV_CMD_SNP_PAGE_SET_STATE = 0x0C6, + SEV_CMD_SNP_PAGE_RECLAIM = 0x0C7, + SEV_CMD_SNP_PAGE_UNSMASH = 0x0C8, + SEV_CMD_SNP_CONFIG = 0x0C9, + SEV_CMD_SNP_DOWNLOAD_FIRMWARE_EX = 0x0CA, + SEV_CMD_SNP_COMMIT = 0x0CB, + SEV_CMD_SNP_VLEK_LOAD = 0x0CD, + SEV_CMD_MAX, }; @@ -523,6 +553,241 @@ struct sev_data_attestation_report { u32 len; /* In/Out */ } __packed; +/** + * struct sev_data_snp_download_firmware - SNP_DOWNLOAD_FIRMWARE command params + * + * @address: physical address of firmware image + * @len: length of the firmware image + */ +struct sev_data_snp_download_firmware { + u64 address; /* In */ + u32 len; /* In */ +} __packed; + +/** + * struct sev_data_snp_activate - SNP_ACTIVATE command params + * + * @gctx_paddr: system physical address guest context page + * @asid: ASID to bind to the guest + */ +struct sev_data_snp_activate { + u64 gctx_paddr; /* In */ + u32 asid; /* In */ +} __packed; + +/** + * struct sev_data_snp_addr - generic SNP command params + * + * @address: physical address of generic data param + */ +struct sev_data_snp_addr { + u64 address; /* In/Out */ +} __packed; + +/** + * struct sev_data_snp_launch_start - SNP_LAUNCH_START command params + * + * @gctx_paddr: system physical address of guest context page + * @policy: guest policy + * @ma_gctx_paddr: system physical address of migration agent + * @ma_en: the guest is associated with a migration agent + * @imi_en: launch flow is launching an IMI (Incoming Migration Image) for the + * purpose of guest-assisted migration. 
+ * @rsvd: reserved + * @gosvw: guest OS-visible workarounds, as defined by hypervisor + */ +struct sev_data_snp_launch_start { + u64 gctx_paddr; /* In */ + u64 policy; /* In */ + u64 ma_gctx_paddr; /* In */ + u32 ma_en:1; /* In */ + u32 imi_en:1; /* In */ + u32 rsvd:30; + u8 gosvw[16]; /* In */ +} __packed; + +/* SNP support page type */ +enum { + SNP_PAGE_TYPE_NORMAL = 0x1, + SNP_PAGE_TYPE_VMSA = 0x2, + SNP_PAGE_TYPE_ZERO = 0x3, + SNP_PAGE_TYPE_UNMEASURED = 0x4, + SNP_PAGE_TYPE_SECRET = 0x5, + SNP_PAGE_TYPE_CPUID = 0x6, + + SNP_PAGE_TYPE_MAX +}; + +/** + * struct sev_data_snp_launch_update - SNP_LAUNCH_UPDATE command params + * + * @gctx_paddr: system physical address of guest context page + * @page_size: page size 0 indicates 4K and 1 indicates 2MB page + * @page_type: encoded page type + * @imi_page: indicates that this page is part of the IMI (Incoming Migration + * Image) of the guest + * @rsvd: reserved + * @rsvd2: reserved + * @address: system physical address of destination page to encrypt + * @rsvd3: reserved + * @vmpl1_perms: VMPL permission mask for VMPL1 + * @vmpl2_perms: VMPL permission mask for VMPL2 + * @vmpl3_perms: VMPL permission mask for VMPL3 + * @rsvd4: reserved + */ +struct sev_data_snp_launch_update { + u64 gctx_paddr; /* In */ + u32 page_size:1; /* In */ + u32 page_type:3; /* In */ + u32 imi_page:1; /* In */ + u32 rsvd:27; + u32 rsvd2; + u64 address; /* In */ + u32 rsvd3:8; + u32 vmpl1_perms:8; /* In */ + u32 vmpl2_perms:8; /* In */ + u32 vmpl3_perms:8; /* In */ + u32 rsvd4; +} __packed; + +/** + * struct sev_data_snp_launch_finish - SNP_LAUNCH_FINISH command params + * + * @gctx_paddr: system physical address of guest context page + * @id_block_paddr: system physical address of ID block + * @id_auth_paddr: system physical address of ID block authentication structure + * @id_block_en: indicates whether ID block is present + * @auth_key_en: indicates whether author key is present in authentication structure + * @rsvd: reserved + * 
@host_data: host-supplied data for guest, not interpreted by firmware + */ +struct sev_data_snp_launch_finish { + u64 gctx_paddr; + u64 id_block_paddr; + u64 id_auth_paddr; + u8 id_block_en:1; + u8 auth_key_en:1; + u64 rsvd:62; + u8 host_data[32]; +} __packed; + +/** + * struct sev_data_snp_guest_status - SNP_GUEST_STATUS command params + * + * @gctx_paddr: system physical address of guest context page + * @address: system physical address of guest status page + */ +struct sev_data_snp_guest_status { + u64 gctx_paddr; + u64 address; +} __packed; + +/** + * struct sev_data_snp_page_reclaim - SNP_PAGE_RECLAIM command params + * + * @paddr: system physical address of page to be claimed. The 0th bit in the + * address indicates the page size. 0h indicates 4KB and 1h indicates + * 2MB page. + */ +struct sev_data_snp_page_reclaim { + u64 paddr; +} __packed; + +/** + * struct sev_data_snp_page_unsmash - SNP_PAGE_UNSMASH command params + * + * @paddr: system physical address of page to be unsmashed. The 0th bit in the + * address indicates the page size. 0h indicates 4 KB and 1h indicates + * 2 MB page. 
+ */ +struct sev_data_snp_page_unsmash { + u64 paddr; +} __packed; + +/** + * struct sev_data_snp_dbg - DBG_ENCRYPT/DBG_DECRYPT command parameters + * + * @gctx_paddr: system physical address of guest context page + * @src_addr: source address of data to operate on + * @dst_addr: destination address of data to operate on + */ +struct sev_data_snp_dbg { + u64 gctx_paddr; /* In */ + u64 src_addr; /* In */ + u64 dst_addr; /* In */ +} __packed; + +/** + * struct sev_data_snp_guest_request - SNP_GUEST_REQUEST command params + * + * @gctx_paddr: system physical address of guest context page + * @req_paddr: system physical address of request page + * @res_paddr: system physical address of response page + */ +struct sev_data_snp_guest_request { + u64 gctx_paddr; /* In */ + u64 req_paddr; /* In */ + u64 res_paddr; /* In */ +} __packed; + +/** + * struct sev_data_snp_init_ex - SNP_INIT_EX structure + * + * @init_rmp: indicate that the RMP should be initialized. + * @list_paddr_en: indicate that list_paddr is valid + * @rsvd: reserved + * @rsvd1: reserved + * @list_paddr: system physical address of range list + * @rsvd2: reserved + */ +struct sev_data_snp_init_ex { + u32 init_rmp:1; + u32 list_paddr_en:1; + u32 rsvd:30; + u32 rsvd1; + u64 list_paddr; + u8 rsvd2[48]; +} __packed; + +/** + * struct sev_data_range - RANGE structure + * + * @base: system physical address of first byte of range + * @page_count: number of 4KB pages in this range + * @rsvd: reserved + */ +struct sev_data_range { + u64 base; + u32 page_count; + u32 rsvd; +} __packed; + +/** + * struct sev_data_range_list - RANGE_LIST structure + * + * @num_elements: number of elements in RANGE_ARRAY + * @rsvd: reserved + * @ranges: array of num_elements of type RANGE + */ +struct sev_data_range_list { + u32 num_elements; + u32 rsvd; + struct sev_data_range ranges[]; +} __packed; + +/** + * struct sev_data_snp_shutdown_ex - SNP_SHUTDOWN_EX structure + * + * @len: length of the command buffer read by the PSP + * 
@iommu_snp_shutdown: Disable enforcement of SNP in the IOMMU + * @rsvd1: reserved + */ +struct sev_data_snp_shutdown_ex { + u32 len; + u32 iommu_snp_shutdown:1; + u32 rsvd1:31; +} __packed; + #ifdef CONFIG_CRYPTO_DEV_SP_PSP /** -- cgit v1.2.3 From 1ca5614b84eed5904f65f143e0e7aaab0ac4c6b2 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Thu, 25 Jan 2024 22:11:13 -0600 Subject: crypto: ccp: Add support to initialize the AMD-SP for SEV-SNP Before SNP VMs can be launched, the platform must be appropriately configured and initialized via the SNP_INIT command. During the execution of SNP_INIT command, the firmware configures and enables SNP security policy enforcement in many system components. Some system components write to regions of memory reserved by early x86 firmware (e.g. UEFI). Other system components write to regions provided by the operating system, hypervisor, or x86 firmware. Such system components can only write to HV-fixed pages or Default pages. They will error when attempting to write to pages in other page states after SNP_INIT enables their SNP enforcement. Starting in SNP firmware v1.52, the SNP_INIT_EX command takes a list of system physical address ranges to convert into the HV-fixed page states during the RMP initialization. If INIT_RMP is 1, hypervisors should provide all system physical address ranges that the hypervisor will never assign to a guest until the next RMP re-initialization. For instance, the memory that UEFI reserves should be included in the range list. This allows system components that occasionally write to memory (e.g. logging to UEFI reserved regions) to not fail due to RMP initialization and SNP enablement. Note that SNP_INIT(_EX) must not be executed while non-SEV guests are executing, otherwise it is possible that the system could reset or hang.
The psp_init_on_probe module parameter was added for SEV/SEV-ES support and the init_ex_path module parameter to allow for time for the necessary file system to be mounted/available. SNP_INIT(_EX) does not use the file associated with init_ex_path. So, to avoid running into issues where SNP_INIT(_EX) is called while there are other running guests, issue it during module probe regardless of the psp_init_on_probe setting, but maintain the previous deferrable handling for SEV/SEV-ES initialization. [ mdr: Squash in psp_init_on_probe changes from Tom, reduce proliferation of 'probe' function parameter where possible. bp: Fix 32-bit allmodconfig build. ] Signed-off-by: Brijesh Singh Co-developed-by: Ashish Kalra Signed-off-by: Ashish Kalra Co-developed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen Signed-off-by: Tom Lendacky Signed-off-by: Michael Roth Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240126041126.1927228-14-michael.roth@amd.com --- include/linux/psp-sev.h | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 006e4cdbeb78..ef3b91797a8b 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -788,12 +788,25 @@ struct sev_data_snp_shutdown_ex { u32 rsvd1:31; } __packed; +/** + * struct sev_platform_init_args + * + * @error: SEV firmware error code + * @probe: True if this is being called as part of CCP module probe, which + * will defer SEV_INIT/SEV_INIT_EX firmware initialization until needed + * unless psp_init_on_probe module param is set + */ +struct sev_platform_init_args { + int error; + bool probe; +}; + #ifdef CONFIG_CRYPTO_DEV_SP_PSP /** * sev_platform_init - perform SEV INIT command * - * @error: SEV command return code + * @args: struct sev_platform_init_args to pass in arguments * * Returns: * 0 if the SEV successfully processed the command @@ -802,7 +815,7 @@ struct 
sev_data_snp_shutdown_ex { * -%ETIMEDOUT if the SEV command timed out * -%EIO if the SEV returned a non-zero return code */ -int sev_platform_init(int *error); +int sev_platform_init(struct sev_platform_init_args *args); /** * sev_platform_status - perform SEV PLATFORM_STATUS command @@ -909,7 +922,7 @@ void *psp_copy_user_blob(u64 uaddr, u32 len); static inline int sev_platform_status(struct sev_user_data_status *status, int *error) { return -ENODEV; } -static inline int sev_platform_init(int *error) { return -ENODEV; } +static inline int sev_platform_init(struct sev_platform_init_args *args) { return -ENODEV; } static inline int sev_guest_deactivate(struct sev_data_deactivate *data, int *error) { return -ENODEV; } -- cgit v1.2.3 From 18085ac2f2fbf56aee9cbf5846740150e394f4f4 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Thu, 25 Jan 2024 22:11:14 -0600 Subject: crypto: ccp: Provide an API to issue SEV and SNP commands Export sev_do_cmd() as a generic API for the hypervisor to issue commands to manage an SEV or an SNP guest. The commands for SEV and SNP are defined in the SEV and SEV-SNP firmware specifications. 
Signed-off-by: Brijesh Singh Signed-off-by: Ashish Kalra Signed-off-by: Michael Roth Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240126041126.1927228-15-michael.roth@amd.com --- include/linux/psp-sev.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index ef3b91797a8b..bcf9ceda030f 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -915,6 +915,22 @@ int sev_guest_df_flush(int *error); */ int sev_guest_decommission(struct sev_data_decommission *data, int *error); +/** + * sev_do_cmd - issue an SEV or an SEV-SNP command + * + * @cmd: SEV or SEV-SNP firmware command to issue + * @data: arguments for firmware command + * @psp_ret: SEV command return code + * + * Returns: + * 0 if the SEV device successfully processed the command + * -%ENODEV if the PSP device is not available + * -%ENOTSUPP if PSP device does not support SEV + * -%ETIMEDOUT if the SEV command timed out + * -%EIO if PSP device returned a non-zero return code + */ +int sev_do_cmd(int cmd, void *data, int *psp_ret); + void *psp_copy_user_blob(u64 uaddr, u32 len); #else /* !CONFIG_CRYPTO_DEV_SP_PSP */ @@ -930,6 +946,9 @@ sev_guest_deactivate(struct sev_data_deactivate *data, int *error) { return -ENO static inline int sev_guest_decommission(struct sev_data_decommission *data, int *error) { return -ENODEV; } +static inline int +sev_do_cmd(int cmd, void *data, int *psp_ret) { return -ENODEV; } + static inline int sev_guest_activate(struct sev_data_activate *data, int *error) { return -ENODEV; } -- cgit v1.2.3 From 24512afa4336a1c14de750238abe32759cfba4b0 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Thu, 25 Jan 2024 22:11:16 -0600 Subject: crypto: ccp: Handle the legacy TMR allocation when SNP is enabled The behavior and requirement for the SEV-legacy command is altered when the SNP firmware is in the INIT state. 
See SEV-SNP firmware ABI specification for more details. Allocate the Trusted Memory Region (TMR) as a 2MB-sized/aligned region when SNP is enabled to satisfy new requirements for SNP. Continue allocating a 1MB-sized region for !SNP configuration. [ bp: Carve out TMR allocation into a helper. ] Signed-off-by: Brijesh Singh Co-developed-by: Ashish Kalra Signed-off-by: Ashish Kalra Signed-off-by: Michael Roth Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240126041126.1927228-17-michael.roth@amd.com --- include/linux/psp-sev.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index bcf9ceda030f..84eabbfbbc08 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -932,6 +932,8 @@ int sev_guest_decommission(struct sev_data_decommission *data, int *error); int sev_do_cmd(int cmd, void *data, int *psp_ret); void *psp_copy_user_blob(u64 uaddr, u32 len); +void *snp_alloc_firmware_page(gfp_t mask); +void snp_free_firmware_page(void *addr); #else /* !CONFIG_CRYPTO_DEV_SP_PSP */ @@ -959,6 +961,13 @@ sev_issue_cmd_external_user(struct file *filep, unsigned int id, void *data, int static inline void *psp_copy_user_blob(u64 __user uaddr, u32 len) { return ERR_PTR(-EINVAL); } +static inline void *snp_alloc_firmware_page(gfp_t mask) +{ + return NULL; +} + +static inline void snp_free_firmware_page(void *addr) { } + #endif /* CONFIG_CRYPTO_DEV_SP_PSP */ #endif /* __PSP_SEV_H__ */ -- cgit v1.2.3 From f366a8dac1b8fef28a470d4e67b9843ebb8e2a1f Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Thu, 25 Jan 2024 22:11:19 -0600 Subject: iommu/amd: Clean up RMP entries for IOMMU pages during SNP shutdown Add a new IOMMU API interface amd_iommu_snp_disable() to transition IOMMU pages to Hypervisor state from Reclaim state after SNP_SHUTDOWN_EX command. Invoke this API from the CCP driver after SNP_SHUTDOWN_EX command. 
Signed-off-by: Ashish Kalra Signed-off-by: Michael Roth Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240126041126.1927228-20-michael.roth@amd.com --- include/linux/amd-iommu.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index 7365be00a795..2b90c48a6a87 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -85,4 +85,10 @@ int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn, u64 *value); struct amd_iommu *get_amd_iommu(unsigned int idx); +#ifdef CONFIG_KVM_AMD_SEV +int amd_iommu_snp_disable(void); +#else +static inline int amd_iommu_snp_disable(void) { return 0; } +#endif + #endif /* _ASM_X86_AMD_IOMMU_H */ -- cgit v1.2.3 From fad133c79afa02344d05001324a0474e20f3e055 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Thu, 25 Jan 2024 22:11:24 -0600 Subject: crypto: ccp: Add the SNP_COMMIT command The SNP_COMMIT command is used to commit the currently installed version of the SEV firmware. Once committed, the firmware cannot be replaced with a previous firmware version (cannot be rolled back). This command will also update the reported TCB to match that of the currently installed firmware. [ mdr: Note the reported TCB update in the documentation/commit. 
] Signed-off-by: Tom Lendacky Signed-off-by: Michael Roth Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240126041126.1927228-25-michael.roth@amd.com --- include/linux/psp-sev.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 84eabbfbbc08..3705c2044fc0 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -801,6 +801,15 @@ struct sev_platform_init_args { bool probe; }; +/** + * struct sev_data_snp_commit - SNP_COMMIT structure + * + * @len: length of the command buffer read by the PSP + */ +struct sev_data_snp_commit { + u32 len; +} __packed; + #ifdef CONFIG_CRYPTO_DEV_SP_PSP /** -- cgit v1.2.3 From ed0ef85795b58134172e8c82ab2f1b869cd501a6 Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Mon, 29 Jan 2024 11:21:35 +0530 Subject: ASoC/soundwire: implement generic api for scanning amd soundwire controller Implement generic function for scanning SoundWire controller. Same function will be used for legacy and sof stack for AMD platforms. Signed-off-by: Vijendar Mukunda Acked-by: Vinod Koul Link: https://msgid.link/r/20240129055147.1493853-2-Vijendar.Mukunda@amd.com Signed-off-by: Mark Brown --- include/linux/soundwire/sdw_amd.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_amd.h b/include/linux/soundwire/sdw_amd.h index ceecad74aef9..41dd64941cef 100644 --- a/include/linux/soundwire/sdw_amd.h +++ b/include/linux/soundwire/sdw_amd.h @@ -6,6 +6,7 @@ #ifndef __SDW_AMD_H #define __SDW_AMD_H +#include #include /* AMD pm_runtime quirk definitions */ @@ -106,4 +107,18 @@ struct amd_sdw_manager { struct sdw_amd_dai_runtime **dai_runtime_array; }; + +/** + * struct sdw_amd_acpi_info - Soundwire AMD information found in ACPI tables + * @handle: ACPI controller handle + * @count: maximum no of soundwire manager links supported on AMD platform. 
+ * @link_mask: bit-wise mask listing links enabled by BIOS menu + */ +struct sdw_amd_acpi_info { + acpi_handle handle; + int count; + u32 link_mask; +}; + +int amd_sdw_scan_controller(struct sdw_amd_acpi_info *info); #endif -- cgit v1.2.3 From a47746428cf5762290d0c55f6ef82067af04d165 Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Mon, 29 Jan 2024 11:21:36 +0530 Subject: soundwire: amd: update license Update license to dual license to align with Sound Open Firmware (SOF) driver as SOF uses dual license. Signed-off-by: Vijendar Mukunda Acked-by: Vinod Koul Link: https://msgid.link/r/20240129055147.1493853-3-Vijendar.Mukunda@amd.com Signed-off-by: Mark Brown --- include/linux/soundwire/sdw_amd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_amd.h b/include/linux/soundwire/sdw_amd.h index 41dd64941cef..56b4117c087a 100644 --- a/include/linux/soundwire/sdw_amd.h +++ b/include/linux/soundwire/sdw_amd.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ /* * Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. */ -- cgit v1.2.3 From ed5e8741b8db908d51a26e368c18573ee1b9e208 Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Mon, 29 Jan 2024 11:21:37 +0530 Subject: soundwire: amd: refactor amd soundwire manager device node creation Refactor amd SoundWire manager device node creation logic and implement generic functions to have a common functionality for SoundWire manager platform device creation, start and exit sequence for both legacy(NO DSP) and SOF stack for AMD platforms. These functions will be invoked from legacy and SOF stack. 
Signed-off-by: Vijendar Mukunda Acked-by: Vinod Koul Link: https://msgid.link/r/20240129055147.1493853-4-Vijendar.Mukunda@amd.com Signed-off-by: Mark Brown --- include/linux/soundwire/sdw_amd.h | 56 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_amd.h b/include/linux/soundwire/sdw_amd.h index 56b4117c087a..54735fa49759 100644 --- a/include/linux/soundwire/sdw_amd.h +++ b/include/linux/soundwire/sdw_amd.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ /* - * Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2023-24 Advanced Micro Devices, Inc. All rights reserved. */ #ifndef __SDW_AMD_H @@ -26,6 +26,7 @@ #define AMD_SDW_POWER_OFF_MODE 2 #define ACP_SDW0 0 #define ACP_SDW1 1 +#define AMD_SDW_MAX_MANAGER_COUNT 2 struct acp_sdw_pdata { u16 instance; @@ -63,7 +64,6 @@ struct sdw_amd_dai_runtime { * @reg_mask: register mask structure per manager instance * @amd_sdw_irq_thread: SoundWire manager irq workqueue * @amd_sdw_work: peripheral status work queue - * @probe_work: SoundWire manager probe workqueue * @acp_sdw_lock: mutex to protect acp share register access * @status: peripheral devices status array * @num_din_ports: number of input ports @@ -87,7 +87,6 @@ struct amd_sdw_manager { struct sdw_manager_reg_mask *reg_mask; struct work_struct amd_sdw_irq_thread; struct work_struct amd_sdw_work; - struct work_struct probe_work; /* mutex to protect acp common register access */ struct mutex *acp_sdw_lock; @@ -120,5 +119,56 @@ struct sdw_amd_acpi_info { u32 link_mask; }; +/** + * struct sdw_amd_ctx - context allocated by the controller driver probe + * + * @count: link count + * @num_slaves: total number of devices exposed across all enabled links + * @link_mask: bit-wise mask listing SoundWire links reported by the + * Controller + * @ids: array of slave_id, representing Slaves exposed across all 
enabled + * links + * @pdev: platform device structure + */ +struct sdw_amd_ctx { + int count; + int num_slaves; + u32 link_mask; + struct sdw_extended_slave_id *ids; + struct platform_device *pdev[AMD_SDW_MAX_MANAGER_COUNT]; +}; + +/** + * struct sdw_amd_res - Soundwire AMD global resource structure, + * typically populated by the DSP driver/Legacy driver + * + * @addr: acp pci device resource start address + * @reg_range: ACP register range + * @link_mask: bit-wise mask listing links selected by the DSP driver/ + * legacy driver + * @count: link count + * @mmio_base: mmio base of SoundWire registers + * @handle: ACPI parent handle + * @parent: parent device + * @dev: device implementing hwparams and free callbacks + * @acp_lock: mutex protecting acp common registers access + */ +struct sdw_amd_res { + u32 addr; + u32 reg_range; + u32 link_mask; + int count; + void __iomem *mmio_base; + acpi_handle handle; + struct device *parent; + struct device *dev; + /* use to protect acp common registers access */ + struct mutex *acp_lock; +}; + +int sdw_amd_probe(struct sdw_amd_res *res, struct sdw_amd_ctx **ctx); + +void sdw_amd_exit(struct sdw_amd_ctx *ctx); + int amd_sdw_scan_controller(struct sdw_amd_acpi_info *info); #endif -- cgit v1.2.3 From aff9d088a306541117e420d96ed6b6f1215a7e2d Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Mon, 29 Jan 2024 11:21:38 +0530 Subject: soundwire: amd: implement function to extract slave information Implement function to extract slaves information connected on the bus. This information is required during machine select logic. This function will be called from machine select logic code. 
Signed-off-by: Vijendar Mukunda Acked-by: Vinod Koul Link: https://msgid.link/r/20240129055147.1493853-5-Vijendar.Mukunda@amd.com Signed-off-by: Mark Brown --- include/linux/soundwire/sdw_amd.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_amd.h b/include/linux/soundwire/sdw_amd.h index 54735fa49759..9103772c2497 100644 --- a/include/linux/soundwire/sdw_amd.h +++ b/include/linux/soundwire/sdw_amd.h @@ -170,5 +170,7 @@ int sdw_amd_probe(struct sdw_amd_res *res, struct sdw_amd_ctx **ctx); void sdw_amd_exit(struct sdw_amd_ctx *ctx); +int sdw_amd_get_slave_info(struct sdw_amd_ctx *ctx); + int amd_sdw_scan_controller(struct sdw_amd_acpi_info *info); #endif -- cgit v1.2.3 From c1263c75294cc8178ca964e0220b35518d6fb38d Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Mon, 29 Jan 2024 11:21:40 +0530 Subject: soundwire: amd: refactor register mask structure Register mask array structure is no longer needed as except interrupt control masks, rest of the register masks are not used in code. Use array for interrupt masks instead of structure. 
Signed-off-by: Vijendar Mukunda Acked-by: Vinod Koul Link: https://msgid.link/r/20240129055147.1493853-7-Vijendar.Mukunda@amd.com Signed-off-by: Mark Brown --- include/linux/soundwire/sdw_amd.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_amd.h b/include/linux/soundwire/sdw_amd.h index 9103772c2497..28a4eb77717f 100644 --- a/include/linux/soundwire/sdw_amd.h +++ b/include/linux/soundwire/sdw_amd.h @@ -34,12 +34,6 @@ struct acp_sdw_pdata { struct mutex *acp_sdw_lock; }; -struct sdw_manager_reg_mask { - u32 sw_pad_enable_mask; - u32 sw_pad_pulldown_mask; - u32 acp_sdw_intr_mask; -}; - /** * struct sdw_amd_dai_runtime: AMD sdw dai runtime data * @@ -61,7 +55,6 @@ struct sdw_amd_dai_runtime { * @dev: linux device * @mmio: SoundWire registers mmio base * @acp_mmio: acp registers mmio base - * @reg_mask: register mask structure per manager instance * @amd_sdw_irq_thread: SoundWire manager irq workqueue * @amd_sdw_work: peripheral status work queue * @acp_sdw_lock: mutex to protect acp share register access @@ -84,7 +77,6 @@ struct amd_sdw_manager { void __iomem *mmio; void __iomem *acp_mmio; - struct sdw_manager_reg_mask *reg_mask; struct work_struct amd_sdw_irq_thread; struct work_struct amd_sdw_work; /* mutex to protect acp common register access */ -- cgit v1.2.3 From 7cbf7f4bf71a054d687c8860380c655a36d0f369 Mon Sep 17 00:00:00 2001 From: Siddharth Vadapalli Date: Wed, 24 Jan 2024 18:13:18 +0530 Subject: dmaengine: ti: k3-udma-glue: Add function to request TX chan for thread ID The existing function k3_udma_glue_request_tx_chn() supports requesting a TX DMA channel by its name. Add a new function to request TX DMA channel for a given thread ID, named k3_udma_glue_request_tx_chn_for_thread_id(). Also, export it for use by drivers which are probed by alternate methods (non device-tree) but still wish to make use of the existing DMA APIs. 
Such drivers could be informed about the thread ID corresponding to the TX DMA channel by RPMsg for example. Since the new function k3_udma_glue_request_tx_chn_for_thread_id() reuses most of the code in k3_udma_glue_request_tx_chn(), create a new function for the common code, named k3_udma_glue_request_tx_chn_common(). Signed-off-by: Siddharth Vadapalli Acked-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20240124124319.820002-4-s-vadapalli@ti.com Signed-off-by: Vinod Koul --- include/linux/dma/k3-udma-glue.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma/k3-udma-glue.h b/include/linux/dma/k3-udma-glue.h index e443be4d3b4b..c81386ceb1c1 100644 --- a/include/linux/dma/k3-udma-glue.h +++ b/include/linux/dma/k3-udma-glue.h @@ -26,6 +26,11 @@ struct k3_udma_glue_tx_channel; struct k3_udma_glue_tx_channel *k3_udma_glue_request_tx_chn(struct device *dev, const char *name, struct k3_udma_glue_tx_channel_cfg *cfg); +struct k3_udma_glue_tx_channel * +k3_udma_glue_request_tx_chn_for_thread_id(struct device *dev, + struct k3_udma_glue_tx_channel_cfg *cfg, + struct device_node *udmax_np, u32 thread_id); + void k3_udma_glue_release_tx_chn(struct k3_udma_glue_tx_channel *tx_chn); int k3_udma_glue_push_tx_chn(struct k3_udma_glue_tx_channel *tx_chn, struct cppi5_host_desc_t *desc_tx, -- cgit v1.2.3 From e54df52312fed462a005706d5d7ed6250da91d1e Mon Sep 17 00:00:00 2001 From: Siddharth Vadapalli Date: Wed, 24 Jan 2024 18:13:19 +0530 Subject: dmaengine: ti: k3-udma-glue: Add function to request RX chan for thread ID The existing function k3_udma_glue_request_remote_rx_chn() supports requesting an RX DMA channel and flow by the name of the RX DMA channel. Add support to request RX DMA channel for a given thread ID in the form of a new function named k3_udma_glue_request_remote_rx_chn_for_thread_id(). 
Also, export it for use by drivers which are probed by alternate methods (non device-tree) but still wish to make use of the existing DMA APIs. Such drivers could be informed about the thread ID corresponding to the RX DMA channel by RPMsg for example. Since the new function k3_udma_glue_request_remote_rx_chn_for_thread_id() reuses most of the code in k3_udma_glue_request_remote_rx_chn(), create a new function named k3_udma_glue_request_remote_rx_chn_common() for the common code. Signed-off-by: Siddharth Vadapalli Acked-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20240124124319.820002-5-s-vadapalli@ti.com Signed-off-by: Vinod Koul --- include/linux/dma/k3-udma-glue.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma/k3-udma-glue.h b/include/linux/dma/k3-udma-glue.h index c81386ceb1c1..1e491c5dcac2 100644 --- a/include/linux/dma/k3-udma-glue.h +++ b/include/linux/dma/k3-udma-glue.h @@ -114,6 +114,11 @@ struct k3_udma_glue_rx_channel *k3_udma_glue_request_rx_chn( const char *name, struct k3_udma_glue_rx_channel_cfg *cfg); +struct k3_udma_glue_rx_channel * +k3_udma_glue_request_remote_rx_chn_for_thread_id(struct device *dev, + struct k3_udma_glue_rx_channel_cfg *cfg, + struct device_node *udmax_np, u32 thread_id); + void k3_udma_glue_release_rx_chn(struct k3_udma_glue_rx_channel *rx_chn); int k3_udma_glue_enable_rx_chn(struct k3_udma_glue_rx_channel *rx_chn); void k3_udma_glue_disable_rx_chn(struct k3_udma_glue_rx_channel *rx_chn); -- cgit v1.2.3 From e2b3c4ff5d183da6d1863c2321413406a2752e7a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 29 Jan 2024 16:06:45 -0800 Subject: bpf: add __arg_trusted global func arg tag Add support for passing PTR_TO_BTF_ID registers to global subprogs. Currently only PTR_TRUSTED flavor of PTR_TO_BTF_ID is supported. Non-NULL semantics is assumed, so caller will be forced to prove PTR_TO_BTF_ID can't be NULL. 
Note, we disallow global subprogs to destroy passed in PTR_TO_BTF_ID arguments, even the trusted one. We achieve that by not setting ref_obj_id when validating subprog code. This basically enforces (in Rust terms) borrowing semantics vs move semantics. Borrowing semantics seems to be a better fit for isolated global subprog validation approach. Implementation-wise, we utilize existing logic for matching user-provided BTF type to kernel-side BTF type, used by BPF CO-RE logic and following same matching rules. We enforce a unique match for types. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240130000648.2144827-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7f5816482a10..0dcde339dc7e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -610,6 +610,7 @@ struct bpf_subprog_arg_info { enum bpf_arg_type arg_type; union { u32 mem_size; + u32 btf_id; }; }; -- cgit v1.2.3 From 8ddf54a32111f6dbe06cd318af443c6545a6c037 Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Fri, 5 Jan 2024 10:42:53 -0700 Subject: bus: mhi: host: Read PK HASH dynamically The OEM PK HASH registers in the BHI region are read once during firmware load (boot), cached, and displayed on demand via sysfs. This has a few problems - if firmware load is skipped, the registers will not be read and if the register values change over the life of the device the local cache will be out of sync. Qualcomm Cloud AI 100 can expose both these problems. It is possible for mhi_async_power_up() to be invoked while the device is in AMSS EE, which would bypass firmware loading. 
Also, Qualcomm Cloud AI 100 has 5 PK HASH slots which can be dynamically provisioned while the device is active, which would result in the values changing and users may want to know what keys are active. Address these concerns by reading the PK HASH registers on-demand during the sysfs read. This will result in showing the most current information. Signed-off-by: Jeffrey Hugo Reviewed-by: Pranjal Ramajor Asha Kanojiya Reviewed-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20240105174253.863388-1-quic_jhugo@quicinc.com Signed-off-by: Manivannan Sadhasivam --- include/linux/mhi.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mhi.h b/include/linux/mhi.h index d0f9b522f328..474d32cb0520 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -325,7 +325,6 @@ struct mhi_controller_config { * @major_version: MHI controller major revision number * @minor_version: MHI controller minor revision number * @serial_number: MHI controller serial number obtained from BHI - * @oem_pk_hash: MHI controller OEM PK Hash obtained from BHI * @mhi_event: MHI event ring configurations table * @mhi_cmd: MHI command ring configurations table * @mhi_ctxt: MHI device context, shared memory between host and device @@ -413,7 +412,6 @@ struct mhi_controller { u32 major_version; u32 minor_version; u32 serial_number; - u32 oem_pk_hash[MHI_MAX_OEM_PK_HASH_SEGMENTS]; struct mhi_event *mhi_event; struct mhi_cmd *mhi_cmd; -- cgit v1.2.3 From 1c9f2c7606afe149800986182638f636646dd824 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 10 Jan 2024 08:28:16 -1000 Subject: kernfs: Rearrange kernfs_node fields to reduce its size on 64bit Moving .flags and .mode right below .hash makes kernfs_node smaller by 8 bytes on 64bit. To avoid creating a hole from 8 bytes alignment on 32bit archs, .priv is moved below so that there are two 32bit pointers after the 64bit .id field. v2: Updated to avoid size increase on 32bit noticed by Geert. 
Signed-off-by: Tejun Heo Cc: Geert Uytterhoeven Link: https://lore.kernel.org/r/ZZ7hwA18nfmFjYpj@slm.duckdns.org Signed-off-by: Greg Kroah-Hartman --- include/linux/kernfs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 99aaa050ccb7..82e1ce79a70c 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -206,22 +206,22 @@ struct kernfs_node { const void *ns; /* namespace tag */ unsigned int hash; /* ns + name hash */ + unsigned short flags; + umode_t mode; + union { struct kernfs_elem_dir dir; struct kernfs_elem_symlink symlink; struct kernfs_elem_attr attr; }; - void *priv; - /* * 64bit unique ID. On 64bit ino setups, id is the ino. On 32bit, * the low 32bits are ino and upper generation. */ u64 id; - unsigned short flags; - umode_t mode; + void *priv; struct kernfs_iattrs *iattr; }; -- cgit v1.2.3 From 4207b556e62f0a8915afc5da4c5d5ad915a253a5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Jan 2024 11:48:04 -1000 Subject: kernfs: RCU protect kernfs_nodes and avoid kernfs_idr_lock in kernfs_find_and_get_node_by_id() The BPF helper bpf_cgroup_from_id() calls kernfs_find_and_get_node_by_id() which acquires kernfs_idr_lock, which is a non-raw non-IRQ-safe lock. This can lead to deadlocks as bpf_cgroup_from_id() can be called from any BPF program including e.g. the ones that attach to functions which are holding the scheduler rq lock. 
Consider the following BPF program: SEC("fentry/__set_cpus_allowed_ptr_locked") int BPF_PROG(__set_cpus_allowed_ptr_locked, struct task_struct *p, struct affinity_context *affn_ctx, struct rq *rq, struct rq_flags *rf) { struct cgroup *cgrp = bpf_cgroup_from_id(p->cgroups->dfl_cgrp->kn->id); if (cgrp) { bpf_printk("%d[%s] in %s", p->pid, p->comm, cgrp->kn->name); bpf_cgroup_release(cgrp); } return 0; } __set_cpus_allowed_ptr_locked() is called with rq lock held and the above BPF program calls bpf_cgroup_from_id() within leading to the following lockdep warning: ===================================================== WARNING: HARDIRQ-safe -> HARDIRQ-unsafe lock order detected 6.7.0-rc3-work-00053-g07124366a1d7-dirty #147 Not tainted ----------------------------------------------------- repro/1620 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire: ffffffff833b3688 (kernfs_idr_lock){+.+.}-{2:2}, at: kernfs_find_and_get_node_by_id+0x1e/0x70 and this task is already holding: ffff888237ced698 (&rq->__lock){-.-.}-{2:2}, at: task_rq_lock+0x4e/0xf0 which would create a new lock dependency: (&rq->__lock){-.-.}-{2:2} -> (kernfs_idr_lock){+.+.}-{2:2} ... Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(kernfs_idr_lock); local_irq_disable(); lock(&rq->__lock); lock(kernfs_idr_lock); lock(&rq->__lock); *** DEADLOCK *** ... 
Call Trace: dump_stack_lvl+0x55/0x70 dump_stack+0x10/0x20 __lock_acquire+0x781/0x2a40 lock_acquire+0xbf/0x1f0 _raw_spin_lock+0x2f/0x40 kernfs_find_and_get_node_by_id+0x1e/0x70 cgroup_get_from_id+0x21/0x240 bpf_cgroup_from_id+0xe/0x20 bpf_prog_98652316e9337a5a___set_cpus_allowed_ptr_locked+0x96/0x11a bpf_trampoline_6442545632+0x4f/0x1000 __set_cpus_allowed_ptr_locked+0x5/0x5a0 sched_setaffinity+0x1b3/0x290 __x64_sys_sched_setaffinity+0x4f/0x60 do_syscall_64+0x40/0xe0 entry_SYSCALL_64_after_hwframe+0x46/0x4e Let's fix it by protecting kernfs_node and kernfs_root with RCU and making kernfs_find_and_get_node_by_id() acquire rcu_read_lock() instead of kernfs_idr_lock. This adds an rcu_head to kernfs_node making it larger by 16 bytes on 64bit. Combined with the preceding rearrange patch, the net increase is 8 bytes. Signed-off-by: Tejun Heo Cc: Andrea Righi Cc: Geert Uytterhoeven Link: https://lore.kernel.org/r/20240109214828.252092-4-tj@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/kernfs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 82e1ce79a70c..87c79d076d6d 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -223,6 +223,8 @@ struct kernfs_node { void *priv; struct kernfs_iattrs *iattr; + + struct rcu_head rcu; }; /* -- cgit v1.2.3 From 3a480d4bb5b1e1f09426223e68acaa90da32e384 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 5 Jan 2024 11:26:48 +0100 Subject: driver core: cpu: make cpu_subsys const Now that the driver core can properly handle constant struct bus_type, move the cpu_subsys variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Cc: "Rafael J. 
Wysocki" Link: https://lore.kernel.org/r/2024010548-crane-snooze-a871@gregkh Signed-off-by: Greg Kroah-Hartman --- include/linux/cpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index dcb89c987164..0b993a140946 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -128,7 +128,7 @@ static inline void cpu_maps_update_done(void) static inline int add_cpu(unsigned int cpu) { return 0;} #endif /* CONFIG_SMP */ -extern struct bus_type cpu_subsys; +extern const struct bus_type cpu_subsys; extern int lockdep_is_cpus_held(void); -- cgit v1.2.3 From 4d5e86a56615cc387d21c629f9af8fb0e958d350 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 28 Jan 2024 11:29:11 +0200 Subject: RDMA/mlx5: Fix fortify source warning while accessing Eth segment ------------[ cut here ]------------ memcpy: detected field-spanning write (size 56) of single field "eseg->inline_hdr.start" at /var/lib/dkms/mlnx-ofed-kernel/5.8/build/drivers/infiniband/hw/mlx5/wr.c:131 (size 2) WARNING: CPU: 0 PID: 293779 at /var/lib/dkms/mlnx-ofed-kernel/5.8/build/drivers/infiniband/hw/mlx5/wr.c:131 mlx5_ib_post_send+0x191b/0x1a60 [mlx5_ib] Modules linked in: 8021q garp mrp stp llc rdma_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) ib_umad(OE) mlx5_ib(OE) ib_uverbs(OE) ib_core(OE) mlx5_core(OE) pci_hyperv_intf mlxdevm(OE) mlx_compat(OE) tls mlxfw(OE) psample nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables libcrc32c nfnetlink mst_pciconf(OE) knem(OE) vfio_pci vfio_pci_core vfio_iommu_type1 vfio iommufd irqbypass cuse nfsv3 nfs fscache netfs xfrm_user xfrm_algo ipmi_devintf ipmi_msghandler binfmt_misc crct10dif_pclmul crc32_pclmul polyval_clmulni polyval_generic ghash_clmulni_intel sha512_ssse3 snd_pcsp aesni_intel crypto_simd cryptd snd_pcm snd_timer joydev snd 
soundcore input_leds serio_raw evbug nfsd auth_rpcgss nfs_acl lockd grace sch_fq_codel sunrpc drm efi_pstore ip_tables x_tables autofs4 psmouse virtio_net net_failover failover floppy [last unloaded: mlx_compat(OE)] CPU: 0 PID: 293779 Comm: ssh Tainted: G OE 6.2.0-32-generic #32~22.04.1-Ubuntu Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 RIP: 0010:mlx5_ib_post_send+0x191b/0x1a60 [mlx5_ib] Code: 0c 01 00 a8 01 75 25 48 8b 75 a0 b9 02 00 00 00 48 c7 c2 10 5b fd c0 48 c7 c7 80 5b fd c0 c6 05 57 0c 03 00 01 e8 95 4d 93 da <0f> 0b 44 8b 4d b0 4c 8b 45 c8 48 8b 4d c0 e9 49 fb ff ff 41 0f b7 RSP: 0018:ffffb5b48478b570 EFLAGS: 00010046 RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffffb5b48478b628 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffffb5b48478b5e8 R13: ffff963a3c609b5e R14: ffff9639c3fbd800 R15: ffffb5b480475a80 FS: 00007fc03b444c80(0000) GS:ffff963a3dc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000556f46bdf000 CR3: 0000000006ac6003 CR4: 00000000003706f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? show_regs+0x72/0x90 ? mlx5_ib_post_send+0x191b/0x1a60 [mlx5_ib] ? __warn+0x8d/0x160 ? mlx5_ib_post_send+0x191b/0x1a60 [mlx5_ib] ? report_bug+0x1bb/0x1d0 ? handle_bug+0x46/0x90 ? exc_invalid_op+0x19/0x80 ? asm_exc_invalid_op+0x1b/0x20 ? mlx5_ib_post_send+0x191b/0x1a60 [mlx5_ib] mlx5_ib_post_send_nodrain+0xb/0x20 [mlx5_ib] ipoib_send+0x2ec/0x770 [ib_ipoib] ipoib_start_xmit+0x5a0/0x770 [ib_ipoib] dev_hard_start_xmit+0x8e/0x1e0 ? validate_xmit_skb_list+0x4d/0x80 sch_direct_xmit+0x116/0x3a0 __dev_xmit_skb+0x1fd/0x580 __dev_queue_xmit+0x284/0x6b0 ? _raw_spin_unlock_irq+0xe/0x50 ? __flush_work.isra.0+0x20d/0x370 ? 
push_pseudo_header+0x17/0x40 [ib_ipoib] neigh_connected_output+0xcd/0x110 ip_finish_output2+0x179/0x480 ? __smp_call_single_queue+0x61/0xa0 __ip_finish_output+0xc3/0x190 ip_finish_output+0x2e/0xf0 ip_output+0x78/0x110 ? __pfx_ip_finish_output+0x10/0x10 ip_local_out+0x64/0x70 __ip_queue_xmit+0x18a/0x460 ip_queue_xmit+0x15/0x30 __tcp_transmit_skb+0x914/0x9c0 tcp_write_xmit+0x334/0x8d0 tcp_push_one+0x3c/0x60 tcp_sendmsg_locked+0x2e1/0xac0 tcp_sendmsg+0x2d/0x50 inet_sendmsg+0x43/0x90 sock_sendmsg+0x68/0x80 sock_write_iter+0x93/0x100 vfs_write+0x326/0x3c0 ksys_write+0xbd/0xf0 ? do_syscall_64+0x69/0x90 __x64_sys_write+0x19/0x30 do_syscall_64+0x59/0x90 ? do_user_addr_fault+0x1d0/0x640 ? exit_to_user_mode_prepare+0x3b/0xd0 ? irqentry_exit_to_user_mode+0x9/0x20 ? irqentry_exit+0x43/0x50 ? exc_page_fault+0x92/0x1b0 entry_SYSCALL_64_after_hwframe+0x72/0xdc RIP: 0033:0x7fc03ad14a37 Code: 10 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24 RSP: 002b:00007ffdf8697fe8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000008024 RCX: 00007fc03ad14a37 RDX: 0000000000008024 RSI: 0000556f46bd8270 RDI: 0000000000000003 RBP: 0000556f46bb1800 R08: 0000000000007fe3 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000002 R13: 0000556f46bc66b0 R14: 000000000000000a R15: 0000556f46bb2f50 ---[ end trace 0000000000000000 ]--- Link: https://lore.kernel.org/r/8228ad34bd1a25047586270f7b1fb4ddcd046282.1706433934.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- include/linux/mlx5/qp.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index bd53cf4be7bd..f0e55bf3ec8b 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -269,7 +269,10 @@ struct mlx5_wqe_eth_seg { union { struct { 
__be16 sz; - u8 start[2]; + union { + u8 start[2]; + DECLARE_FLEX_ARRAY(u8, data); + }; } inline_hdr; struct { __be16 type; -- cgit v1.2.3 From 43fdbd140238d44e7e847232719fef7d20f9d326 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Sun, 28 Jan 2024 11:29:12 +0200 Subject: IB/mlx5: Don't expose debugfs entries for RRoCE general parameters if not supported debugfs entries for RRoCE general CC parameters must be exposed only when they are supported, otherwise when accessing them there may be a syndrome error in kernel log, for example: $ cat /sys/kernel/debug/mlx5/0000:08:00.1/cc_params/rtt_resp_dscp cat: '/sys/kernel/debug/mlx5/0000:08:00.1/cc_params/rtt_resp_dscp': Invalid argument $ dmesg mlx5_core 0000:08:00.1: mlx5_cmd_out_err:805:(pid 1253): QUERY_CONG_PARAMS(0x824) op_mod(0x0) failed, status bad parameter(0x3), syndrome (0x325a82), err(-22) Fixes: 66fb1d5df6ac ("IB/mlx5: Extend debug control for CC parameters") Reviewed-by: Edward Srouji Signed-off-by: Mark Zhang Link: https://lore.kernel.org/r/e7ade70bad52b7468bdb1de4d41d5fad70c8b71c.1706433934.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index bf5320b28b8b..2c10350bd422 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1103,7 +1103,7 @@ struct mlx5_ifc_roce_cap_bits { u8 sw_r_roce_src_udp_port[0x1]; u8 fl_rc_qp_when_roce_disabled[0x1]; u8 fl_rc_qp_when_roce_enabled[0x1]; - u8 reserved_at_7[0x1]; + u8 roce_cc_general[0x1]; u8 qp_ooo_transmit_default[0x1]; u8 reserved_at_9[0x15]; u8 qp_ts_format[0x2]; -- cgit v1.2.3 From c5c3e1bfc9e0ee72af528df8d773980f4855938a Mon Sep 17 00:00:00 2001 From: Rohan G Thomas Date: Sat, 27 Jan 2024 12:04:41 +0800 Subject: net: stmmac: Offload queueMaxSDU from tc-taprio Add support for configuring queueMaxSDU. 
As DWMAC IPs don't support a queueMaxSDU table, handle this in SW. The maximum 802.3 frame size that is allowed to be transmitted by any queue is queueMaxSDU + 16 bytes (i.e. 6 bytes SA + 6 bytes DA + 4 bytes FCS). Inspired by the Intel i225 driver. Signed-off-by: Rohan G Thomas Signed-off-by: David S. Miller --- include/linux/stmmac.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index dee5ad6e48c5..dfa1828cd756 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -127,6 +127,7 @@ struct stmmac_est { u32 gcl_unaligned[EST_GCL]; u32 gcl[EST_GCL]; u32 gcl_size; + u32 max_sdu[MTL_MAX_TX_QUEUES]; }; struct stmmac_rxq_cfg { -- cgit v1.2.3 From d80a52335374e484a4ff2afdc9af843e73273945 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 27 Jan 2024 14:25:09 +0100 Subject: ethtool: replace struct ethtool_eee with a new struct ethtool_keee on kernel side In order to pass EEE link modes beyond bit 32 to userspace we have to complement the 32 bit bitmaps in struct ethtool_eee with linkmode bitmaps. Therefore, similar to ethtool_link_settings and ethtool_link_ksettings, add a struct ethtool_keee. In a first step it's an identical copy of ethtool_eee. This patch simply does a s/ethtool_eee/ethtool_keee/g for all users. No functional change intended. Suggested-by: Andrew Lunn Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- include/linux/ethtool.h | 16 ++++++++++++++-- include/linux/phy.h | 8 ++++---- include/linux/phylink.h | 4 ++-- 3 files changed, 20 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 325e0778e937..a850bab8489f 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -222,6 +222,18 @@ extern int __ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *link_ksettings); +struct ethtool_keee { + u32 cmd; + u32 supported; + u32 advertised; + u32 lp_advertised; + u32 eee_active; + u32 eee_enabled; + u32 tx_lpi_enabled; + u32 tx_lpi_timer; + u32 reserved[2]; +}; + struct kernel_ethtool_coalesce { u8 use_cqe_mode_tx; u8 use_cqe_mode_rx; @@ -892,8 +904,8 @@ struct ethtool_ops { struct ethtool_modinfo *); int (*get_module_eeprom)(struct net_device *, struct ethtool_eeprom *, u8 *); - int (*get_eee)(struct net_device *, struct ethtool_eee *); - int (*set_eee)(struct net_device *, struct ethtool_eee *); + int (*get_eee)(struct net_device *dev, struct ethtool_keee *eee); + int (*set_eee)(struct net_device *dev, struct ethtool_keee *eee); int (*get_tunable)(struct net_device *, const struct ethtool_tunable *, void *); int (*set_tunable)(struct net_device *, diff --git a/include/linux/phy.h b/include/linux/phy.h index c9994a59ca2e..a66f07d3f5f4 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1908,9 +1908,9 @@ int genphy_c45_plca_get_status(struct phy_device *phydev, int genphy_c45_eee_is_active(struct phy_device *phydev, unsigned long *adv, unsigned long *lp, bool *is_enabled); int genphy_c45_ethtool_get_eee(struct phy_device *phydev, - struct ethtool_eee *data); + struct ethtool_keee *data); int genphy_c45_ethtool_set_eee(struct phy_device *phydev, - struct ethtool_eee *data); + struct ethtool_keee *data); int genphy_c45_write_eee_adv(struct phy_device *phydev, unsigned long *adv); int genphy_c45_an_config_eee_aneg(struct phy_device 
*phydev); int genphy_c45_read_eee_adv(struct phy_device *phydev, unsigned long *adv); @@ -1988,8 +1988,8 @@ int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask); int phy_init_eee(struct phy_device *phydev, bool clk_stop_enable); int phy_get_eee_err(struct phy_device *phydev); -int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_eee *data); -int phy_ethtool_get_eee(struct phy_device *phydev, struct ethtool_eee *data); +int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_keee *data); +int phy_ethtool_get_eee(struct phy_device *phydev, struct ethtool_keee *data); int phy_ethtool_set_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol); void phy_ethtool_get_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol); diff --git a/include/linux/phylink.h b/include/linux/phylink.h index d589f89c612c..6ba411732a0d 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -584,8 +584,8 @@ int phylink_ethtool_set_pauseparam(struct phylink *, struct ethtool_pauseparam *); int phylink_get_eee_err(struct phylink *); int phylink_init_eee(struct phylink *, bool); -int phylink_ethtool_get_eee(struct phylink *, struct ethtool_eee *); -int phylink_ethtool_set_eee(struct phylink *, struct ethtool_eee *); +int phylink_ethtool_get_eee(struct phylink *link, struct ethtool_keee *eee); +int phylink_ethtool_set_eee(struct phylink *link, struct ethtool_keee *eee); int phylink_mii_ioctl(struct phylink *, struct ifreq *, int); int phylink_speed_down(struct phylink *pl, bool sync); int phylink_speed_up(struct phylink *pl); -- cgit v1.2.3 From 285cc15cc555b4f05ebf2556bc6e85a6d36b790a Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 27 Jan 2024 14:26:50 +0100 Subject: ethtool: adjust struct ethtool_keee to kernel needs This patch changes the following in struct ethtool_keee - remove member cmd, it's not needed on kernel side - remove reserved fields - switch the semantically boolean members to type bool We don't have to change 
any user of the boolean members due to the implicit casting from/to bool. A small change is needed where a pointer to bool members is used, in addition remove few now unneeded double negations. Reviewed-by: Andrew Lunn Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- include/linux/ethtool.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index a850bab8489f..14549cb9e2b2 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -223,15 +223,13 @@ __ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *link_ksettings); struct ethtool_keee { - u32 cmd; u32 supported; u32 advertised; u32 lp_advertised; - u32 eee_active; - u32 eee_enabled; - u32 tx_lpi_enabled; u32 tx_lpi_timer; - u32 reserved[2]; + bool tx_lpi_enabled; + bool eee_active; + bool eee_enabled; }; struct kernel_ethtool_coalesce { -- cgit v1.2.3 From 1d756ff13da6a2222ac4387511f2a0e2e83ce670 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 27 Jan 2024 14:28:47 +0100 Subject: ethtool: add suffix _u32 to legacy bitmap members of struct ethtool_keee This is in preparation of using the existing names for linkmode bitmaps. Suggested-by: Andrew Lunn Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- include/linux/ethtool.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 14549cb9e2b2..89807c30f5a7 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -223,9 +223,9 @@ __ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *link_ksettings); struct ethtool_keee { - u32 supported; - u32 advertised; - u32 lp_advertised; + u32 supported_u32; + u32 advertised_u32; + u32 lp_advertised_u32; u32 tx_lpi_timer; bool tx_lpi_enabled; bool eee_active; -- cgit v1.2.3 From 1f069de63602e8d39d7d9fd6195f65235316f79a Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 27 Jan 2024 14:29:33 +0100 Subject: ethtool: add linkmode bitmap support to struct ethtool_keee Add linkmode bitmap members to struct ethtool_keee, but keep the legacy u32 bitmaps for compatibility with existing drivers. Use linkmode "supported" not being empty as indicator that a user wants to use the linkmode bitmap members instead of the legacy bitmaps. Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- include/linux/ethtool.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 89807c30f5a7..b90c33607594 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -223,6 +223,9 @@ __ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *link_ksettings); struct ethtool_keee { + __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); + __ETHTOOL_DECLARE_LINK_MODE_MASK(advertised); + __ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertised); u32 supported_u32; u32 advertised_u32; u32 lp_advertised_u32; -- cgit v1.2.3 From f3a052391822b772b4e27f2594526cf1eb103cab Mon Sep 17 00:00:00 2001 From: Meng Li Date: Fri, 19 Jan 2024 17:04:58 +0800 Subject: cpufreq: amd-pstate: Enable amd-pstate preferred core support amd-pstate driver utilizes the functions and data structures provided by the ITMT architecture to enable the scheduler to favor scheduling on cores which can get a higher frequency with lower voltage. We call it amd-pstate preferred core. Here sched_set_itmt_core_prio() is called to set priorities and sched_set_itmt_support() is called to enable ITMT feature. amd-pstate driver uses the highest performance value to indicate the priority of CPU. The higher value has a higher priority. The initial core rankings are set up by amd-pstate when the system boots. Add a variable hw_prefcore in cpudata structure. It will check if the processor and power firmware support preferred core feature. Add one new early parameter `disable` to allow user to disable the preferred core. Only when hardware supports preferred core and user set `enabled` in early parameter, amd pstate driver supports preferred core feature. Tested-by: Oleksandr Natalenko Reviewed-by: Huang Rui Reviewed-by: Wyes Karny Reviewed-by: Mario Limonciello Co-developed-by: Perry Yuan Signed-off-by: Perry Yuan Signed-off-by: Meng Li Signed-off-by: Rafael J. 
Wysocki --- include/linux/amd-pstate.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h index 6ad02ad9c7b4..68fc1bd8d851 100644 --- a/include/linux/amd-pstate.h +++ b/include/linux/amd-pstate.h @@ -52,6 +52,9 @@ struct amd_aperf_mperf { * @prev: Last Aperf/Mperf/tsc count value read from register * @freq: current cpu frequency value * @boost_supported: check whether the Processor or SBIOS supports boost mode + * @hw_prefcore: check whether HW supports preferred core featue. + * Only when hw_prefcore and early prefcore param are true, + * AMD P-State driver supports preferred core featue. * @epp_policy: Last saved policy used to set energy-performance preference * @epp_cached: Cached CPPC energy-performance preference value * @policy: Cpufreq policy value @@ -85,6 +88,7 @@ struct amd_cpudata { u64 freq; bool boost_supported; + bool hw_prefcore; /* EPP feature related attributes*/ s16 epp_policy; -- cgit v1.2.3 From 9c4a13a08a9b7afa4bc33f57675358f0195e302c Mon Sep 17 00:00:00 2001 From: Meng Li Date: Fri, 19 Jan 2024 17:04:59 +0800 Subject: ACPI: cpufreq: Add highest perf change notification Platform firmware sends notify 0x85 to inform the OS that the highest performance of a CPU has changed. This will be used by the AMD P-state driver to update the ranking of preferred cores and set the priority of cores accordingly. Tested-by: Oleksandr Natalenko Reviewed-by: Mario Limonciello Reviewed-by: Huang Rui Reviewed-by: Perry Yuan Signed-off-by: Meng Li Link: https://uefi.org/specs/ACPI/6.5/05_ACPI_Software_Programming_Model.html#processor-device-notification-values [ rjw: New subject, changelog edits ] Signed-off-by: Rafael J. 
Wysocki --- include/linux/cpufreq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index afda5f24d3dd..9bebeec24abb 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -263,6 +263,7 @@ static inline bool cpufreq_supports_freq_invariance(void) return false; } static inline void disable_cpufreq(void) { } +static inline void cpufreq_update_limits(unsigned int cpu) { } #endif #ifdef CONFIG_CPU_FREQ_STAT -- cgit v1.2.3 From e571a5e2068ef57945fcd5d0fb950f8f96da6dc8 Mon Sep 17 00:00:00 2001 From: Meng Li Date: Fri, 19 Jan 2024 17:05:00 +0800 Subject: cpufreq: amd-pstate: Update amd-pstate preferred core ranking dynamically Preferred core rankings can be changed dynamically by the platform based on the workload and platform conditions and accounting for thermals and aging. When this occurs, cpu priority need to be set. Tested-by: Oleksandr Natalenko Reviewed-by: Mario Limonciello Reviewed-by: Wyes Karny Reviewed-by: Huang Rui Reviewed-by: Perry Yuan Signed-off-by: Meng Li Signed-off-by: Rafael J. Wysocki --- include/linux/amd-pstate.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h index 68fc1bd8d851..d21838835abd 100644 --- a/include/linux/amd-pstate.h +++ b/include/linux/amd-pstate.h @@ -39,11 +39,16 @@ struct amd_aperf_mperf { * @cppc_req_cached: cached performance request hints * @highest_perf: the maximum performance an individual processor may reach, * assuming ideal conditions + * For platforms that do not support the preferred core feature, the + * highest_pef may be configured with 166 or 255, to avoid max frequency + * calculated wrongly. we take the fixed value as the highest_perf. 
* @nominal_perf: the maximum sustained performance level of the processor, * assuming ideal operating conditions * @lowest_nonlinear_perf: the lowest performance level at which nonlinear power * savings are achieved * @lowest_perf: the absolute lowest performance level of the processor + * @prefcore_ranking: the preferred core ranking, the higher value indicates a higher + * priority. * @max_freq: the frequency that mapped to highest_perf * @min_freq: the frequency that mapped to lowest_perf * @nominal_freq: the frequency that mapped to nominal_perf @@ -73,6 +78,7 @@ struct amd_cpudata { u32 nominal_perf; u32 lowest_nonlinear_perf; u32 lowest_perf; + u32 prefcore_ranking; u32 min_limit_perf; u32 max_limit_perf; u32 min_limit_freq; -- cgit v1.2.3 From 2a71528427c635f0a8bff704b2e62ce81c641d6f Mon Sep 17 00:00:00 2001 From: Arend van Spriel Date: Sun, 28 Jan 2024 10:30:57 +0100 Subject: wifi: brcmfmac: fix copyright year mentioned in platform_data header The driver found its inception a little after the year 201. According to git blame output it was added in 2016 so let's go with that. 
Fixes: 4d7928959832 ("brcmfmac: switch to new platform data") Reported-by: Dmitry Antipov Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo Link: https://msgid.link/20240128093057.164791-3-arend.vanspriel@broadcom.com --- include/linux/platform_data/brcmfmac.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/brcmfmac.h b/include/linux/platform_data/brcmfmac.h index f922a192fe58..ec99b7b73d1d 100644 --- a/include/linux/platform_data/brcmfmac.h +++ b/include/linux/platform_data/brcmfmac.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 201 Broadcom Corporation + * Copyright (c) 2016 Broadcom Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above -- cgit v1.2.3 From e6c5812dc4d0b3e890608cb9c98597d1bed7e937 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Wed, 31 Jan 2024 11:07:27 -0600 Subject: spi: reorder spi_message struct member doc comments The members of `struct spi_message` were reordered in commit ae2ade4ba581 ("spi: Reorder fields in 'struct spi_message'") but the documentation comments were not updated to match. This commit updates the comments to match the new order. 
Signed-off-by: David Lechner Link: https://msgid.link/r/20240131170732.1665105-1-dlechner@baylibre.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index f306aececeaf..29c3e4dd5d93 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -1113,16 +1113,16 @@ struct spi_transfer { * @spi: SPI device to which the transaction is queued * @is_dma_mapped: if true, the caller provided both DMA and CPU virtual * addresses for each transfer buffer + * @prepared: spi_prepare_message was called for the this message + * @status: zero for success, else negative errno * @complete: called to report transaction completions * @context: the argument to complete() when it's called * @frame_length: the total number of bytes in the message * @actual_length: the total number of bytes that were transferred in all * successful segments - * @status: zero for success, else negative errno * @queue: for use by whichever driver currently owns the message * @state: for use by whichever driver currently owns the message * @resources: for resource management when the SPI message is processed - * @prepared: spi_prepare_message was called for the this message * * A @spi_message is used to execute an atomic sequence of data transfers, * each represented by a struct spi_transfer. The sequence is "atomic" -- cgit v1.2.3 From 79b47344bbc5a693a92ed6b2b09dac59254bfac8 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Sun, 28 Jan 2024 18:24:06 -0700 Subject: bpf: btf: Support flags for BTF_SET8 sets This commit adds support for flags on BTF_SET8s. struct btf_id_set8 already supported 32 bits worth of flags, but was only used for alignment purposes before. We now use these bits to encode flags. The first use case is tagging kfunc sets with a flag so that pahole can recognize which BTF_ID_FLAGS(func, ..) are actual kfuncs. 
Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/7bb152ec76d6c2c930daec88e995bf18484a5ebb.1706491398.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- include/linux/btf_ids.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index a9cb10b0e2e9..dca09b7f21dc 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -21,6 +21,7 @@ struct btf_id_set8 { #include /* for __PASTE */ #include /* for __maybe_unused */ +#include /* * Following macros help to define lists of BTF IDs placed @@ -183,17 +184,18 @@ extern struct btf_id_set name; * .word (1 << 3) | (1 << 1) | (1 << 2) * */ -#define __BTF_SET8_START(name, scope) \ +#define __BTF_SET8_START(name, scope, flags) \ +__BTF_ID_LIST(name, local) \ asm( \ ".pushsection " BTF_IDS_SECTION ",\"a\"; \n" \ "." #scope " __BTF_ID__set8__" #name "; \n" \ "__BTF_ID__set8__" #name ":; \n" \ -".zero 8 \n" \ +".zero 4 \n" \ +".long " __stringify(flags) "\n" \ ".popsection; \n"); #define BTF_SET8_START(name) \ -__BTF_ID_LIST(name, local) \ -__BTF_SET8_START(name, local) +__BTF_SET8_START(name, local, 0) #define BTF_SET8_END(name) \ asm( \ -- cgit v1.2.3 From a05e90427ef6706f59188b379ad6366b9d298bc5 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Sun, 28 Jan 2024 18:24:07 -0700 Subject: bpf: btf: Add BTF_KFUNCS_START/END macro pair This macro pair is functionally equivalent to BTF_SET8_START/END, except with BTF_SET8_KFUNCS flag set in the btf_id_set8 flags field. The next commit will codemod all kfunc set8s to this new variant such that all kfuncs are tagged as such in .BTF_ids section. 
Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/d536c57c7c2af428686853cc7396b7a44faa53b7.1706491398.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- include/linux/btf_ids.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index dca09b7f21dc..e24aabfe8ecc 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -8,6 +8,9 @@ struct btf_id_set { u32 ids[]; }; +/* This flag implies BTF_SET8 holds kfunc(s) */ +#define BTF_SET8_KFUNCS (1 << 0) + struct btf_id_set8 { u32 cnt; u32 flags; @@ -204,6 +207,12 @@ asm( \ ".popsection; \n"); \ extern struct btf_id_set8 name; +#define BTF_KFUNCS_START(name) \ +__BTF_SET8_START(name, local, BTF_SET8_KFUNCS) + +#define BTF_KFUNCS_END(name) \ +BTF_SET8_END(name) + #else #define BTF_ID_LIST(name) static u32 __maybe_unused name[64]; @@ -218,6 +227,8 @@ extern struct btf_id_set8 name; #define BTF_SET_END(name) #define BTF_SET8_START(name) static struct btf_id_set8 __maybe_unused name = { 0 }; #define BTF_SET8_END(name) +#define BTF_KFUNCS_START(name) static struct btf_id_set8 __maybe_unused name = { .flags = BTF_SET8_KFUNCS }; +#define BTF_KFUNCS_END(name) #endif /* CONFIG_DEBUG_INFO_BTF */ -- cgit v1.2.3 From 3723b56d6f73f7c8c3b521a80556f129830f6fb9 Mon Sep 17 00:00:00 2001 From: Philippe Schenker Date: Tue, 30 Jan 2024 09:34:19 +0100 Subject: net: dsa: Add KSZ8567 switch support This commit introduces support for the KSZ8567, a robust 7-port Ethernet switch. The KSZ8567 features two RGMII/MII/RMII interfaces, each capable of gigabit speeds, complemented by five 10/100 Mbps MAC/PHYs. 
Signed-off-by: Philippe Schenker Acked-by: Arun Ramadoss Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20240130083419.135763-2-dev@pschenker.ch Signed-off-by: Paolo Abeni --- include/linux/platform_data/microchip-ksz.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/microchip-ksz.h b/include/linux/platform_data/microchip-ksz.h index f177416635a2..8c659db4da6b 100644 --- a/include/linux/platform_data/microchip-ksz.h +++ b/include/linux/platform_data/microchip-ksz.h @@ -33,6 +33,7 @@ enum ksz_chip_id { KSZ9897_CHIP_ID = 0x00989700, KSZ9893_CHIP_ID = 0x00989300, KSZ9563_CHIP_ID = 0x00956300, + KSZ8567_CHIP_ID = 0x00856700, KSZ9567_CHIP_ID = 0x00956700, LAN9370_CHIP_ID = 0x00937000, LAN9371_CHIP_ID = 0x00937100, -- cgit v1.2.3 From 4d2ff655fb85a0bf1ecec6022ffdacb2a5f83fd2 Mon Sep 17 00:00:00 2001 From: Lukasz Majczak Date: Fri, 26 Jan 2024 09:57:19 +0000 Subject: platform/chrome: Update binary interface for EC-based watchdog Update structures and defines related to EC_CMD_HANG_DETECT to allow usage of new EC-based watchdog. 
Signed-off-by: Lukasz Majczak Reviewed-by: Guenter Roeck Acked-by: Tzung-Bi Shih Link: https://lore.kernel.org/r/20240126095721.782782-2-lma@chromium.org Signed-off-by: Lee Jones --- include/linux/platform_data/cros_ec_commands.h | 78 ++++++++++++-------------- 1 file changed, 35 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h index 7dae17b62a4d..ecc47d5fe239 100644 --- a/include/linux/platform_data/cros_ec_commands.h +++ b/include/linux/platform_data/cros_ec_commands.h @@ -3961,60 +3961,52 @@ struct ec_response_i2c_passthru { } __ec_align1; /*****************************************************************************/ -/* Power button hang detect */ - +/* AP hang detect */ #define EC_CMD_HANG_DETECT 0x009F -/* Reasons to start hang detection timer */ -/* Power button pressed */ -#define EC_HANG_START_ON_POWER_PRESS BIT(0) - -/* Lid closed */ -#define EC_HANG_START_ON_LID_CLOSE BIT(1) - - /* Lid opened */ -#define EC_HANG_START_ON_LID_OPEN BIT(2) - -/* Start of AP S3->S0 transition (booting or resuming from suspend) */ -#define EC_HANG_START_ON_RESUME BIT(3) - -/* Reasons to cancel hang detection */ +#define EC_HANG_DETECT_MIN_TIMEOUT 5 +#define EC_HANG_DETECT_MAX_TIMEOUT 65535 -/* Power button released */ -#define EC_HANG_STOP_ON_POWER_RELEASE BIT(8) +/* EC hang detect commands */ +enum ec_hang_detect_cmds { + /* Reload AP hang detect timer. */ + EC_HANG_DETECT_CMD_RELOAD = 0x0, -/* Any host command from AP received */ -#define EC_HANG_STOP_ON_HOST_COMMAND BIT(9) + /* Stop AP hang detect timer. */ + EC_HANG_DETECT_CMD_CANCEL = 0x1, -/* Stop on end of AP S0->S3 transition (suspending or shutting down) */ -#define EC_HANG_STOP_ON_SUSPEND BIT(10) + /* Configure watchdog with given reboot timeout and + * cancel currently running AP hang detect timer. 
+ */ + EC_HANG_DETECT_CMD_SET_TIMEOUT = 0x2, -/* - * If this flag is set, all the other fields are ignored, and the hang detect - * timer is started. This provides the AP a way to start the hang timer - * without reconfiguring any of the other hang detect settings. Note that - * you must previously have configured the timeouts. - */ -#define EC_HANG_START_NOW BIT(30) + /* Get last hang status - whether the AP boot was clear or not */ + EC_HANG_DETECT_CMD_GET_STATUS = 0x3, -/* - * If this flag is set, all the other fields are ignored (including - * EC_HANG_START_NOW). This provides the AP a way to stop the hang timer - * without reconfiguring any of the other hang detect settings. - */ -#define EC_HANG_STOP_NOW BIT(31) + /* Clear last hang status. Called when AP is rebooting/shutting down + * gracefully. + */ + EC_HANG_DETECT_CMD_CLEAR_STATUS = 0x4 +}; struct ec_params_hang_detect { - /* Flags; see EC_HANG_* */ - uint32_t flags; - - /* Timeout in msec before generating host event, if enabled */ - uint16_t host_event_timeout_msec; + uint16_t command; /* enum ec_hang_detect_cmds */ + /* Timeout in seconds before generating reboot */ + uint16_t reboot_timeout_sec; +} __ec_align2; - /* Timeout in msec before generating warm reboot, if enabled */ - uint16_t warm_reboot_timeout_msec; -} __ec_align4; +/* Status codes that describe whether AP has boot normally or the hang has been + * detected and EC has reset AP + */ +enum ec_hang_detect_status { + EC_HANG_DETECT_AP_BOOT_NORMAL = 0x0, + EC_HANG_DETECT_AP_BOOT_EC_WDT = 0x1, + EC_HANG_DETECT_AP_BOOT_COUNT, +}; +struct ec_response_hang_detect { + uint8_t status; /* enum ec_hang_detect_status */ +} __ec_align1; /*****************************************************************************/ /* Commands for battery charging */ -- cgit v1.2.3 From c1f5204efcbcced83f67f12fa8f1a7f5f244fb87 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sun, 28 Jan 2024 22:21:04 -0800 Subject: cpumask: add cpumask_weight_andnot() Similarly to 
cpumask_weight_and(), cpumask_weight_andnot() is a handy helper that may help to avoid creating an intermediate mask just to calculate number of bits that set in a 1st given mask, and clear in 2nd one. Signed-off-by: Yury Norov Reviewed-by: Jacob Keller Signed-off-by: Paolo Abeni --- include/linux/bitmap.h | 12 ++++++++++++ include/linux/cpumask.h | 13 +++++++++++++ 2 files changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 99451431e4d6..5814e9ee40ba 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -54,6 +54,7 @@ struct device; * bitmap_full(src, nbits) Are all bits set in *src? * bitmap_weight(src, nbits) Hamming Weight: number set bits * bitmap_weight_and(src1, src2, nbits) Hamming Weight of and'ed bitmap + * bitmap_weight_andnot(src1, src2, nbits) Hamming Weight of andnot'ed bitmap * bitmap_set(dst, pos, nbits) Set specified bit area * bitmap_clear(dst, pos, nbits) Clear specified bit area * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area @@ -169,6 +170,8 @@ bool __bitmap_subset(const unsigned long *bitmap1, unsigned int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits); unsigned int __bitmap_weight_and(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); +unsigned int __bitmap_weight_andnot(const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); void __bitmap_set(unsigned long *map, unsigned int start, int len); void __bitmap_clear(unsigned long *map, unsigned int start, int len); @@ -425,6 +428,15 @@ unsigned long bitmap_weight_and(const unsigned long *src1, return __bitmap_weight_and(src1, src2, nbits); } +static __always_inline +unsigned long bitmap_weight_andnot(const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) +{ + if (small_const_nbits(nbits)) + return hweight_long(*src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits)); + return __bitmap_weight_andnot(src1, 
src2, nbits); +} + static __always_inline void bitmap_set(unsigned long *map, unsigned int start, unsigned int nbits) { diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index cfb545841a2c..228c23eb36d2 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -719,6 +719,19 @@ static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, return bitmap_weight_and(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); } +/** + * cpumask_weight_andnot - Count of bits in (*srcp1 & ~*srcp2) + * @srcp1: the cpumask to count bits (< nr_cpu_ids) in. + * @srcp2: the cpumask to count bits (< nr_cpu_ids) in. + * + * Return: count of bits set in both *srcp1 and *srcp2 + */ +static inline unsigned int cpumask_weight_andnot(const struct cpumask *srcp1, + const struct cpumask *srcp2) +{ + return bitmap_weight_andnot(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); +} + /** * cpumask_shift_right - *dstp = *srcp >> n * @dstp: the cpumask result -- cgit v1.2.3 From dcee228078c34b63089c4b589d4bddf08019d0f6 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sun, 28 Jan 2024 22:21:05 -0800 Subject: cpumask: define cleanup function for cpumasks Now we can simplify code that allocates cpumasks for local needs. Signed-off-by: Yury Norov Signed-off-by: Paolo Abeni --- include/linux/cpumask.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 228c23eb36d2..1c29947db848 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -7,6 +7,7 @@ * set of CPUs in a system, one bit position per CPU number. In general, * only nr_cpu_ids (<= NR_CPUS) bits are valid. 
*/ +#include #include #include #include @@ -990,6 +991,8 @@ static inline bool cpumask_available(cpumask_var_t mask) } #endif /* CONFIG_CPUMASK_OFFSTACK */ +DEFINE_FREE(free_cpumask_var, struct cpumask *, if (_T) free_cpumask_var(_T)); + /* It's common to want to use cpu_all_mask in struct member initializers, * so it has to refer to an address rather than a pointer. */ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); -- cgit v1.2.3 From 432acd550e3607d5fea23e27f6ab4e4567deccfd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Dec 2023 08:26:59 +0100 Subject: iomap: move the io_folios field out of struct iomap_ioend The io_folios member in struct iomap_ioend counts the number of folios added to an ioend. It is only used at submission time and can thus be moved to iomap_writepage_ctx instead. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231207072710.176093-4-hch@lst.de Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Christian Brauner --- include/linux/iomap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 96dd0acbba44..b2a05dff914d 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -293,7 +293,6 @@ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ u16 io_type; u16 io_flags; /* IOMAP_F_* */ - u32 io_folios; /* folios added to ioend */ struct inode *io_inode; /* file being written to */ size_t io_size; /* size of the extent */ loff_t io_offset; /* offset in the file */ @@ -329,6 +328,7 @@ struct iomap_writepage_ctx { struct iomap iomap; struct iomap_ioend *ioend; const struct iomap_writeback_ops *ops; + u32 nr_folios; /* folios added to the ioend */ }; void iomap_finish_ioends(struct iomap_ioend *ioend, int error); -- cgit v1.2.3 From ae5535efd8c445ad6033ac0d5da0197897b148ea Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Dec 2023 08:27:05 +0100 Subject: iomap: don't chain bios 
Back in the days when a single bio could only be filled to the hardware limits, and we scheduled a work item for each bio completion, chaining multiple bios for a single ioend made a lot of sense to reduce the number of completions. But these days bios can be filled until we reach the number of vectors or total size limit, which means we can always fit at least 1 megabyte worth of data in the worst case, but usually a lot more due to large folios. The only thing bio chaining is buying us now is to reduce the size of the allocation from an ioend with an embedded bio into a plain bio, which is a 52 bytes differences on 64-bit systems. This is not worth the added complexity, so remove the bio chaining and only use the bio embedded into the ioend. This will help to simplify further changes to the iomap writeback code. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231207072710.176093-10-hch@lst.de Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- include/linux/iomap.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index b2a05dff914d..b8d3b658ad2b 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -297,10 +297,14 @@ struct iomap_ioend { size_t io_size; /* size of the extent */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ - struct bio *io_bio; /* bio being built */ - struct bio io_inline_bio; /* MUST BE LAST! */ + struct bio io_bio; /* MUST BE LAST! 
*/ }; +static inline struct iomap_ioend *iomap_ioend_from_bio(struct bio *bio) +{ + return container_of(bio, struct iomap_ioend, io_bio); +} + struct iomap_writeback_ops { /* * Required, maps the blocks so that writeback can be performed on -- cgit v1.2.3 From 30deff8531f469453ccc0981f14eceb0a2ea68d6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Dec 2023 08:27:09 +0100 Subject: iomap: map multiple blocks at a time The ->map_blocks interface returns a valid range for writeback, but we still call back into it for every block, which is a bit inefficient. Change iomap_writepage_map to use the valid range in the map until the end of the folio or the dirty range inside the folio instead of calling back into every block. Note that the range is not used over folio boundaries as we need to be able to check the mapping sequence count under the folio lock. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231207072710.176093-14-hch@lst.de Signed-off-by: Christian Brauner --- include/linux/iomap.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index b8d3b658ad2b..49d93f538785 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -309,6 +309,13 @@ struct iomap_writeback_ops { /* * Required, maps the blocks so that writeback can be performed on * the range starting at offset. + * + * Can return arbitrarily large regions, but we need to call into it at + * least once per folio to allow the file systems to synchronize with + * the write path that could be invalidating mappings. + * + * An existing mapping from a previous call to this method can be reused + * by the file system if it is still valid. 
*/ int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode, loff_t offset); -- cgit v1.2.3 From 19871b5c7a003946d3cd4209a348ab7c0df5dbad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Dec 2023 08:27:10 +0100 Subject: iomap: pass the length of the dirty region to ->map_blocks Let the file system know how much dirty data exists at the passed in offset. This allows file systems to allocate the right amount of space that actually is written back if they can't eagerly convert (e.g. because they don't support unwritten extents). Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231207072710.176093-15-hch@lst.de Signed-off-by: Christian Brauner --- include/linux/iomap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 49d93f538785..6fc1c858013d 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -318,7 +318,7 @@ struct iomap_writeback_ops { * by the file system if it is still valid. */ int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode, - loff_t offset); + loff_t offset, unsigned len); /* * Optional, allows the file systems to perform actions just before -- cgit v1.2.3 From e2ca9e75849e63eab6544549b6888595997e8153 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 30 Jan 2024 13:08:30 +0100 Subject: dpll: extend lock_status_get() op by status error and expose to user Pass additional argument status_error over lock_status_get() so drivers can fill it up. In case they do, expose the value over previously introduced attribute to user. Do it only in case the current lock_status is either "unlocked" or "holdover". 
Signed-off-by: Jiri Pirko Acked-by: Vadim Fedorenko Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- include/linux/dpll.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dpll.h b/include/linux/dpll.h index 9cf896ea1d41..9cb02ad73d51 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -19,6 +19,7 @@ struct dpll_device_ops { enum dpll_mode *mode, struct netlink_ext_ack *extack); int (*lock_status_get)(const struct dpll_device *dpll, void *dpll_priv, enum dpll_lock_status *status, + enum dpll_lock_status_error *status_error, struct netlink_ext_ack *extack); int (*temp_get)(const struct dpll_device *dpll, void *dpll_priv, s32 *temp, struct netlink_ext_ack *extack); -- cgit v1.2.3 From 2c54a4d71246379f4ffb9beb6a780f9a49fdfc24 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 30 Jan 2024 13:08:31 +0100 Subject: net/mlx5: DPLL, Implement lock status error value Fill-up the lock status error value properly. Signed-off-by: Jiri Pirko Acked-by: Vadim Fedorenko Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- include/linux/mlx5/mlx5_ifc.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index c726f90ab752..6c44f107b8ba 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -12705,6 +12705,14 @@ enum mlx5_msees_oper_status { MLX5_MSEES_OPER_STATUS_FAIL_FREE_RUNNING = 0x5, }; +enum mlx5_msees_failure_reason { + MLX5_MSEES_FAILURE_REASON_UNDEFINED_ERROR = 0x0, + MLX5_MSEES_FAILURE_REASON_PORT_DOWN = 0x1, + MLX5_MSEES_FAILURE_REASON_TOO_HIGH_FREQUENCY_DIFF = 0x2, + MLX5_MSEES_FAILURE_REASON_NET_SYNCHRONIZER_DEVICE_ERROR = 0x3, + MLX5_MSEES_FAILURE_REASON_LACK_OF_RESOURCES = 0x4, +}; + struct mlx5_ifc_msees_reg_bits { u8 reserved_at_0[0x8]; u8 local_port[0x8]; -- cgit v1.2.3 From 7092e9b3bed1252c7d3f5812b9fb9d82375b73a6 Mon Sep 17 00:00:00 2001 From: Kartik Date: Tue, 17 Oct 2023 
10:53:15 +0530 Subject: mm/util: Introduce kmemdup_array() Introduce kmemdup_array() API to duplicate `n` number of elements from a given array. This internally uses kmemdup to allocate and duplicate the `src` array. Signed-off-by: Kartik Acked-by: Kees Cook Signed-off-by: Thierry Reding --- include/linux/string.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index ab148d8dbfc1..4795ee5c50c6 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -217,6 +217,7 @@ extern char *kstrndup(const char *s, size_t len, gfp_t gfp); extern void *kmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); extern void *kvmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp); +extern void *kmemdup_array(const void *src, size_t element_size, size_t count, gfp_t gfp); extern char **argv_split(gfp_t gfp, const char *str, int *argcp); extern void argv_free(char **argv); -- cgit v1.2.3 From 66a5c40f60f5d88ad8d47ba6a4ba05892853fa1f Mon Sep 17 00:00:00 2001 From: Tanzir Hasan Date: Tue, 26 Dec 2023 18:00:00 +0000 Subject: kernel.h: removed REPEAT_BYTE from kernel.h This patch creates wordpart.h and includes it in asm/word-at-a-time.h for all architectures. WORD_AT_A_TIME_CONSTANTS depends on kernel.h because of REPEAT_BYTE. Moving this to another header and including it where necessary allows us to not include the bloated kernel.h. Making this implicit dependency on REPEAT_BYTE explicit allows for later improvements in the lib/string.c inclusion list. 
Suggested-by: Al Viro Suggested-by: Andy Shevchenko Signed-off-by: Tanzir Hasan Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20231226-libstringheader-v6-1-80aa08c7652c@google.com Signed-off-by: Kees Cook --- include/linux/kernel.h | 8 -------- include/linux/wordpart.h | 13 +++++++++++++ 2 files changed, 13 insertions(+), 8 deletions(-) create mode 100644 include/linux/wordpart.h (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d9ad21058eed..f4a1d582b79d 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -39,14 +39,6 @@ #define STACK_MAGIC 0xdeadbeef -/** - * REPEAT_BYTE - repeat the value @x multiple times as an unsigned long value - * @x: value to repeat - * - * NOTE: @x is not checked for > 0xff; larger values produce odd results. - */ -#define REPEAT_BYTE(x) ((~0ul / 0xff) * (x)) - /* generic data direction definitions */ #define READ 0 #define WRITE 1 diff --git a/include/linux/wordpart.h b/include/linux/wordpart.h new file mode 100644 index 000000000000..c9e6bd773ebd --- /dev/null +++ b/include/linux/wordpart.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_WORDPART_H +#define _LINUX_WORDPART_H +/** + * REPEAT_BYTE - repeat the value @x multiple times as an unsigned long value + * @x: value to repeat + * + * NOTE: @x is not checked for > 0xff; larger values produce odd results. + */ +#define REPEAT_BYTE(x) ((~0ul / 0xff) * (x)) + +#endif // _LINUX_WORDPART_H -- cgit v1.2.3 From 5b2dd77be1d85ac3a8be3749f5605bf0830e2998 Mon Sep 17 00:00:00 2001 From: Anjelique Melendez Date: Thu, 1 Feb 2024 12:44:24 -0800 Subject: soc: qcom: add QCOM PBS driver Add the Qualcomm PBS (Programmable Boot Sequencer) driver. The QCOM PBS driver supports configuring software PBS trigger events through PBS RAM on Qualcomm Technologies, Inc (QTI) PMICs. 
Signed-off-by: Anjelique Melendez Link: https://lore.kernel.org/r/20240201204421.16992-6-quic_amelende@quicinc.com Signed-off-by: Bjorn Andersson --- include/linux/soc/qcom/qcom-pbs.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 include/linux/soc/qcom/qcom-pbs.h (limited to 'include/linux') diff --git a/include/linux/soc/qcom/qcom-pbs.h b/include/linux/soc/qcom/qcom-pbs.h new file mode 100644 index 000000000000..8a46209ccf13 --- /dev/null +++ b/include/linux/soc/qcom/qcom-pbs.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2023 Qualcomm Innovation Center, Inc. All rights reserved. + */ + +#ifndef _QCOM_PBS_H +#define _QCOM_PBS_H + +#include <linux/errno.h> +#include <linux/types.h> + +struct device_node; +struct pbs_dev; + +#if IS_ENABLED(CONFIG_QCOM_PBS) +int qcom_pbs_trigger_event(struct pbs_dev *pbs, u8 bitmap); +struct pbs_dev *get_pbs_client_device(struct device *client_dev); +#else +static inline int qcom_pbs_trigger_event(struct pbs_dev *pbs, u8 bitmap) +{ + return -ENODEV; +} + +static inline struct pbs_dev *get_pbs_client_device(struct device *client_dev) +{ + return ERR_PTR(-ENODEV); +} +#endif + +#endif -- cgit v1.2.3 From 0e9876d8e88d81a35742e90048ab3784c49b910b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 31 Jan 2024 18:01:42 -0500 Subject: filelock: fl_pid field should be signed int This field has been unsigned for a very long time, but most users of the struct file_lock and the file locking internals themselves treat it as a signed value. Change it to be pid_t (which is a signed int). 
Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20240131-flsplit-v3-1-c6129007ee8d@kernel.org Reviewed-by: NeilBrown Signed-off-by: Christian Brauner --- include/linux/filelock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filelock.h b/include/linux/filelock.h index 95e868e09e29..085ff6ba0653 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -98,7 +98,7 @@ struct file_lock { fl_owner_t fl_owner; unsigned int fl_flags; unsigned char fl_type; - unsigned int fl_pid; + pid_t fl_pid; int fl_link_cpu; /* what cpu's list is this on? */ wait_queue_head_t fl_wait; struct file *fl_file; -- cgit v1.2.3 From cdefbf2324ceda662e2667aa2f44e8b9de3d780f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 25 Jan 2024 17:17:34 +0100 Subject: pidfd: cleanup the usage of __pidfd_prepare's flags - make pidfd_create() static. - Don't pass O_RDWR | O_CLOEXEC to __pidfd_prepare() in copy_process(), __pidfd_prepare() adds these flags unconditionally. - Kill the flags check in __pidfd_prepare(). sys_pidfd_open() checks the flags itself, all other users of pidfd_prepare() pass flags = 0. If we need a sanity check for those other in kernel users then WARN_ON_ONCE(flags & ~PIDFD_NONBLOCK) makes more sense. - Don't pass O_RDWR to get_unused_fd_flags(), it ignores everything except O_CLOEXEC. - Don't pass O_CLOEXEC to anon_inode_getfile(), it ignores everything except O_ACCMODE | O_NONBLOCK. 
Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240125161734.GA778@redhat.com Signed-off-by: Christian Brauner --- include/linux/pid.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pid.h b/include/linux/pid.h index 395cacce1179..e6a041cb8bac 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -73,7 +73,6 @@ struct file; extern struct pid *pidfd_pid(const struct file *file); struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags); struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags); -int pidfd_create(struct pid *pid, unsigned int flags); int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret); static inline struct pid *get_pid(struct pid *pid) -- cgit v1.2.3 From 64bef697d33b75fc06c5789b3f8108680271529f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 31 Jan 2024 14:26:02 +0100 Subject: pidfd: implement PIDFD_THREAD flag for pidfd_open() With this flag: - pidfd_open() doesn't require that the target task must be a thread-group leader - pidfd_poll() succeeds when the task exits and becomes a zombie (iow, passes exit_notify()), even if it is a leader and thread-group is not empty. This means that the behaviour of pidfd_poll(PIDFD_THREAD, pid-of-group-leader) is not well defined if it races with exec() from its sub-thread; pidfd_poll() can succeed or not depending on whether pidfd_task_exited() is called before or after exchange_tids(). Perhaps we can improve this behaviour later, pidfd_poll() can probably take sig->group_exec_task into account. But this doesn't really differ from the case when the leader exits before other threads (so pidfd_poll() succeeds) and then another thread execs and pidfd_poll() will block again. thread_group_exited() is no longer used, perhaps it can die. 
Co-developed-by: Tycho Andersen Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240131132602.GA23641@redhat.com Tested-by: Tycho Andersen Reviewed-by: Tycho Andersen Signed-off-by: Christian Brauner --- include/linux/pid.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pid.h b/include/linux/pid.h index e6a041cb8bac..8124d57752b9 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -70,10 +70,11 @@ extern const struct file_operations pidfd_fops; struct file; -extern struct pid *pidfd_pid(const struct file *file); +struct pid *pidfd_pid(const struct file *file); struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags); struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags); int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret); +void do_notify_pidfd(struct task_struct *task); static inline struct pid *get_pid(struct pid *pid) { -- cgit v1.2.3 From b377252eeec91f347cd538011f956a4fe73794b3 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Tue, 30 Jan 2024 12:12:33 +0100 Subject: thermal: core: Change governor name to const char pointer All users are already assigning a const char * to the `governor_name` member of struct thermal_zone_params and to the `name` member of struct thermal_governor. Even if users are technically wrong, it just makes more sense to change this member to be a const char pointer instead of doing the other way around. Signed-off-by: AngeloGioacchino Del Regno Signed-off-by: Rafael J. 
Wysocki --- include/linux/thermal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index b7a3deb372fd..65d8f92a9a0d 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -214,7 +214,7 @@ struct thermal_zone_device { * @governor_list: node in thermal_governor_list (in thermal_core.c) */ struct thermal_governor { - char name[THERMAL_NAME_LENGTH]; + const char *name; int (*bind_to_tz)(struct thermal_zone_device *tz); void (*unbind_from_tz)(struct thermal_zone_device *tz); int (*throttle)(struct thermal_zone_device *tz, @@ -226,7 +226,7 @@ struct thermal_governor { /* Structure to define Thermal Zone parameters */ struct thermal_zone_params { - char governor_name[THERMAL_NAME_LENGTH]; + const char *governor_name; /* * a boolean to indicate if the thermal to hwmon sysfs interface -- cgit v1.2.3 From 398ec3e925eb1c4d5850ec60f7075e0c20199003 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Dec 2023 10:02:46 +0100 Subject: init: Declare rodata_enabled and mark_rodata_ro() at all time Declaring rodata_enabled and mark_rodata_ro() at all time helps removing related #ifdefery in C files. 
Signed-off-by: Christophe Leroy Signed-off-by: Luis Chamberlain --- include/linux/init.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init.h b/include/linux/init.h index 3fa3f6241350..58cef4c2e59a 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -168,12 +168,8 @@ extern initcall_entry_t __initcall_end[]; extern struct file_system_type rootfs_fs_type; -#if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_STRICT_MODULE_RWX) extern bool rodata_enabled; -#endif -#ifdef CONFIG_STRICT_KERNEL_RWX void mark_rodata_ro(void); -#endif extern void (*late_time_init)(void); -- cgit v1.2.3 From c1e6148cb4f83cec841db1f066e8db4a86c1f118 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Sat, 27 Jan 2024 19:52:34 +0200 Subject: bpf: Preserve boundaries and track scalars on narrowing fill When the width of a fill is smaller than the width of the preceding spill, the information about scalar boundaries can still be preserved, as long as it's coerced to the right width (done by coerce_reg_to_size). Even further, if the actual value fits into the fill width, the ID can be preserved as well for further tracking of equal scalars. Implement the above improvements, which makes narrowing fills behave the same as narrowing spills and MOVs between registers. Two tests are adjusted to accommodate for endianness differences and to take into account that it's now allowed to do a narrowing fill from the least significant bits. reg_bounds_sync is added to coerce_reg_to_size to correctly adjust umin/umax boundaries after the var_off truncation, for example, a 64-bit value 0xXXXXXXXX00000000, when read as a 32-bit, gets umin = 0, umax = 0xFFFFFFFF, var_off = (0x0; 0xffffffff00000000), which needs to be synced down to umax = 0, otherwise reg_bounds_sanity_check doesn't pass. 
Signed-off-by: Maxim Mikityanskiy Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240127175237.526726-4-maxtram95@gmail.com --- include/linux/bpf_verifier.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 0dcde339dc7e..84365e6dd85d 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -919,6 +919,15 @@ static inline void mark_verifier_state_scratched(struct bpf_verifier_env *env) env->scratched_stack_slots = ~0ULL; } +static inline bool bpf_stack_narrow_access_ok(int off, int fill_size, int spill_size) +{ +#ifdef __BIG_ENDIAN + off -= spill_size - fill_size; +#endif + + return !(off % BPF_REG_SIZE); +} + const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type); const char *dynptr_type_str(enum bpf_dynptr_type type); const char *iter_type_str(const struct btf *btf, u32 btf_id); -- cgit v1.2.3 From 54ce1927eb787f7bbb7ee664841c8f5932703f39 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 31 Jan 2024 15:55:38 -0800 Subject: cxl/cper: Fix errant CPER prints for CXL events Jonathan reports that CXL CPER events dump an extra generic error message. {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 1 {1}[Hardware Error]: event severity: recoverable {1}[Hardware Error]: Error 0, type: recoverable {1}[Hardware Error]: section type: unknown, fbcd0a77-c260-417f-85a9-088b1621eba6 {1}[Hardware Error]: section length: 0x90 {1}[Hardware Error]: 00000000: 00000090 00000007 00000000 0d938086 ................ {1}[Hardware Error]: 00000010: 00100000 00000000 00040000 00000000 ................ ... CXL events were rerouted through the CXL subsystem for additional processing. However, when that work was done it was missed that cper_estatus_print_section() continued with a generic error message which is confusing. Teach CPER print code to ignore printing details of some section types. 
Assign the CXL event GUIDs to this set to prevent confusing unknown prints. Reported-by: Jonathan Cameron Suggested-by: Jonathan Cameron Signed-off-by: Ira Weiny Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Alison Schofield Signed-off-by: Ard Biesheuvel --- include/linux/cper.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cper.h b/include/linux/cper.h index c1a7dc325121..265b0f8fc0b3 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -90,6 +90,29 @@ enum { GUID_INIT(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E, \ 0x72, 0x2D, 0xEB, 0x41) +/* CXL Event record UUIDs are formatted as GUIDs and reported in section type */ +/* + * General Media Event Record + * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43 + */ +#define CPER_SEC_CXL_GEN_MEDIA_GUID \ + GUID_INIT(0xfbcd0a77, 0xc260, 0x417f, \ + 0x85, 0xa9, 0x08, 0x8b, 0x16, 0x21, 0xeb, 0xa6) +/* + * DRAM Event Record + * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44 + */ +#define CPER_SEC_CXL_DRAM_GUID \ + GUID_INIT(0x601dcbb3, 0x9c06, 0x4eab, \ + 0xb8, 0xaf, 0x4e, 0x9b, 0xfb, 0x5c, 0x96, 0x24) +/* + * Memory Module Event Record + * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45 + */ +#define CPER_SEC_CXL_MEM_MODULE_GUID \ + GUID_INIT(0xfe927475, 0xdd59, 0x4339, \ + 0xa5, 0x86, 0x79, 0xba, 0xb1, 0x13, 0xb7, 0x74) + /* * Flags bits definitions for flags in struct cper_record_header * If set, the error has been recovered -- cgit v1.2.3 From 862cf85fef85becc55a173387527adb4f076fab0 Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Wed, 31 Jan 2024 10:16:47 +0100 Subject: iio: commom: st_sensors: ensure proper DMA alignment Aligning the buffer to the L1 cache is not sufficient in some platforms as they might have larger cacheline sizes for caches after L1 and thus, we can't guarantee DMA safety. That was the whole reason to introduce IIO_DMA_MINALIGN in [1]. Do the same for st_sensors common buffer. 
While at it, moved the odr_lock before buffer_data as we definitely don't want any other data to share a cacheline with the buffer. [1]: https://lore.kernel.org/linux-iio/20220508175712.647246-2-jic23@kernel.org/ Fixes: e031d5f558f1 ("iio:st_sensors: remove buffer allocation at each buffer enable") Signed-off-by: Nuno Sa Cc: Link: https://lore.kernel.org/r/20240131-dev_dma_safety_stm-v2-1-580c07fae51b@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/common/st_sensors.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h index 607c3a89a647..f9ae5cdd884f 100644 --- a/include/linux/iio/common/st_sensors.h +++ b/include/linux/iio/common/st_sensors.h @@ -258,9 +258,9 @@ struct st_sensor_data { bool hw_irq_trigger; s64 hw_timestamp; - char buffer_data[ST_SENSORS_MAX_BUFFER_SIZE] ____cacheline_aligned; - struct mutex odr_lock; + + char buffer_data[ST_SENSORS_MAX_BUFFER_SIZE] __aligned(IIO_DMA_MINALIGN); }; #ifdef CONFIG_IIO_BUFFER -- cgit v1.2.3 From bd8a8d5ec5048ef74002d9f3db5cae971e68712c Mon Sep 17 00:00:00 2001 From: Yunjian Wang Date: Fri, 2 Feb 2024 15:25:55 +0800 Subject: tun: Fix code style issues in <linux/if_tun.h> This fixes the following code style problem: - WARNING: please, no spaces at the start of a line - CHECK: Please use a blank line after function/struct/union/enum declarations Signed-off-by: Yunjian Wang Reviewed-by: Jiri Pirko Reviewed-by: Willem de Bruijn Acked-by: Jason Wang Signed-off-by: David S. 
Miller --- include/linux/if_tun.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h index 2a7660843444..043d442994b0 100644 --- a/include/linux/if_tun.h +++ b/include/linux/if_tun.h @@ -27,44 +27,54 @@ struct tun_xdp_hdr { #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) struct socket *tun_get_socket(struct file *); struct ptr_ring *tun_get_tx_ring(struct file *file); + static inline bool tun_is_xdp_frame(void *ptr) { - return (unsigned long)ptr & TUN_XDP_FLAG; + return (unsigned long)ptr & TUN_XDP_FLAG; } + static inline void *tun_xdp_to_ptr(struct xdp_frame *xdp) { - return (void *)((unsigned long)xdp | TUN_XDP_FLAG); + return (void *)((unsigned long)xdp | TUN_XDP_FLAG); } + static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr) { - return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG); + return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG); } + void tun_ptr_free(void *ptr); #else #include <linux/err.h> #include <linux/errno.h> struct file; struct socket; + static inline struct socket *tun_get_socket(struct file *f) { return ERR_PTR(-EINVAL); } + static inline struct ptr_ring *tun_get_tx_ring(struct file *f) { return ERR_PTR(-EINVAL); } + static inline bool tun_is_xdp_frame(void *ptr) { return false; } + static inline void *tun_xdp_to_ptr(struct xdp_frame *xdp) { return NULL; } + static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr) { return NULL; } + static inline void tun_ptr_free(void *ptr) { } -- cgit v1.2.3 From ffabe98cb576097b77d404d39e8b3df03caa986a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Feb 2024 10:11:06 +0000 Subject: net: make dev_unreg_count global We can use a global dev_unreg_count counter instead of a per netns one. As a bonus we can factorize the changes done on it for bulk device removals. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/rtnetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 410529fca18b..21780608cf47 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -47,6 +47,7 @@ extern int rtnl_lock_killable(void); extern bool refcount_dec_and_rtnl_lock(refcount_t *r); extern wait_queue_head_t netdev_unregistering_wq; +extern atomic_t dev_unreg_count; extern struct rw_semaphore pernet_ops_rwsem; extern struct rw_semaphore net_rwsem; -- cgit v1.2.3 From 4cb1ef64609f9b0254184b2947824f4b46ccab22 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 4 Feb 2024 11:28:06 -1000 Subject: workqueue: Implement BH workqueues to eventually replace tasklets The only generic interface to execute asynchronously in the BH context is tasklet; however, it's marked deprecated and has some design flaws such as the execution code accessing the tasklet item after the execution is complete which can lead to subtle use-after-free in certain usage scenarios and less-developed flush and cancel mechanisms. This patch implements BH workqueues which share the same semantics and features of regular workqueues but execute their work items in the softirq context. As there is always only one BH execution context per CPU, none of the concurrency management mechanisms applies and a BH workqueue can be thought of as a convenience wrapper around softirq. Except for the inability to sleep while executing and lack of max_active adjustments, BH workqueues and work items should behave the same as regular workqueues and work items. Currently, the execution is hooked to tasklet[_hi]. However, the goal is to convert all tasklet users over to BH workqueues. Once the conversion is complete, tasklet can be removed and BH workqueues can directly take over the tasklet softirqs. system_bh[_highpri]_wq are added. 
As queue-wide flushing doesn't exist in tasklet, all existing tasklet users should be able to use the system BH workqueues without creating their own workqueues. v3: - Add missing interrupt.h include. v2: - Instead of using tasklets, hook directly into its softirq action functions - tasklet[_hi]_action(). This is slightly cheaper and closer to the eventual code structure we want to arrive at. Suggested by Lai. - Lai also pointed out several places which need NULL worker->task handling or can use clarification. Updated. Signed-off-by: Tejun Heo Suggested-by: Linus Torvalds Link: http://lkml.kernel.org/r/CAHk-=wjDW53w4-YcSmgKC5RruiRLHmJ1sXeYdp_ZgVoBw=5byA@mail.gmail.com Tested-by: Allen Pais Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 232baea90a1d..283d7891b4c4 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -353,6 +353,7 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } * Documentation/core-api/workqueue.rst. */ enum wq_flags { + WQ_BH = 1 << 0, /* execute in bottom half (softirq) context */ WQ_UNBOUND = 1 << 1, /* not bound to any cpu */ WQ_FREEZABLE = 1 << 2, /* freeze during suspend */ WQ_MEM_RECLAIM = 1 << 3, /* may be used for memory reclaim */ @@ -392,6 +393,9 @@ enum wq_flags { __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ __WQ_ORDERED_EXPLICIT = 1 << 19, /* internal: alloc_ordered_workqueue() */ + + /* BH wq only allows the following flags */ + __WQ_BH_ALLOWS = WQ_BH | WQ_HIGHPRI, }; enum wq_consts { @@ -434,6 +438,9 @@ enum wq_consts { * they are same as their non-power-efficient counterparts - e.g. * system_power_efficient_wq is identical to system_wq if * 'wq_power_efficient' is disabled. See WQ_POWER_EFFICIENT for more info. 
+ * + * system_bh[_highpri]_wq are convenience interface to softirq. BH work items + * are executed in the queueing CPU's BH context in the queueing order. */ extern struct workqueue_struct *system_wq; extern struct workqueue_struct *system_highpri_wq; @@ -442,6 +449,10 @@ extern struct workqueue_struct *system_unbound_wq; extern struct workqueue_struct *system_freezable_wq; extern struct workqueue_struct *system_power_efficient_wq; extern struct workqueue_struct *system_freezable_power_efficient_wq; +extern struct workqueue_struct *system_bh_wq; +extern struct workqueue_struct *system_bh_highpri_wq; + +void workqueue_softirq_action(bool highpri); /** * alloc_workqueue - allocate a workqueue -- cgit v1.2.3 From d3b1a9a778e1a014c5331d1e8d4863fd999eb0b5 Mon Sep 17 00:00:00 2001 From: JonasZhou Date: Fri, 2 Feb 2024 16:33:04 +0800 Subject: fs/address_space: move i_mmap_rwsem to mitigate a false sharing with i_mmap. In the struct address_space, there is a 32-byte gap between i_mmap and i_mmap_rwsem. Due to the alignment of struct address_space variables to 8 bytes, in certain situations, i_mmap and i_mmap_rwsem may end up in the same CACHE line. While running Unixbench/execl, we observe high false sharing issues when accessing i_mmap against i_mmap_rwsem. We move i_mmap_rwsem after i_private_list, ensuring a 64-byte gap between i_mmap and i_mmap_rwsem. For Intel Silver machines (2 sockets) using kernel v6.8 rc-2, the score of Unixbench/execl improves by ~3.94%, and the score of Unixbench/shell improves by ~3.26%. 
Baseline: ------------------------------------------------------------- 162 546 748 11374 21 0xffff92e266af90c0 ------------------------------------------------------------- 46.89% 44.65% 0.00% 0.00% 0x0 1 1 0xffffffff86d5fb96 460 258 271 1069 32 [k] __handle_mm_fault [kernel.vmlinux] memory.c:2940 0 1 4.21% 4.41% 0.00% 0.00% 0x4 1 1 0xffffffff86d0ed54 473 311 288 95 28 [k] filemap_read [kernel.vmlinux] atomic.h:23 0 1 0.00% 0.00% 0.04% 4.76% 0x8 1 1 0xffffffff86d4bcf1 0 0 0 5 4 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:204 0 1 6.41% 6.02% 0.00% 0.00% 0x8 1 1 0xffffffff86d4ba85 411 271 339 210 32 [k] vma_interval_tree_insert [kernel.vmlinux] interval_tree.c:23 0 1 0.00% 0.00% 0.47% 95.24% 0x10 1 1 0xffffffff86d4bd34 0 0 0 74 32 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:339 0 1 0.37% 0.13% 0.00% 0.00% 0x10 1 1 0xffffffff86d4bb4f 328 212 380 7 5 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:338 0 1 5.13% 5.08% 0.00% 0.00% 0x10 1 1 0xffffffff86d4bb4b 416 255 357 197 32 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:338 0 1 1.10% 0.53% 0.00% 0.00% 0x28 1 1 0xffffffff86e06eb8 395 228 351 24 14 [k] do_dentry_open [kernel.vmlinux] open.c:966 0 1 1.10% 2.14% 57.07% 0.00% 0x38 1 1 0xffffffff878c9225 1364 792 462 7003 32 [k] down_write [kernel.vmlinux] atomic64_64.h:109 0 1 0.00% 0.00% 0.01% 0.00% 0x38 1 1 0xffffffff878c8e75 0 0 252 3 2 [k] rwsem_down_write_slowpath [kernel.vmlinux] atomic64_64.h:109 0 1 0.00% 0.13% 0.00% 0.00% 0x38 1 1 0xffffffff878c8e23 0 596 63 2 2 [k] rwsem_down_write_slowpath [kernel.vmlinux] atomic64_64.h:15 0 1 2.38% 2.94% 6.53% 0.00% 0x38 1 1 0xffffffff878c8ccb 1150 818 570 1197 32 [k] rwsem_down_write_slowpath [kernel.vmlinux] atomic64_64.h:109 0 1 30.59% 32.22% 0.00% 0.00% 0x38 1 1 0xffffffff878c8cb4 423 251 380 648 32 [k] rwsem_down_write_slowpath [kernel.vmlinux] atomic64_64.h:15 0 1 1.83% 1.74% 35.88% 0.00% 0x38 1 1 0xffffffff86b4f833 1217 1112 565 4586 32 [k] 
up_write [kernel.vmlinux] atomic64_64.h:91 0 1 with this change: ------------------------------------------------------------- 360 12 300 57 35 0xffff982cdae76400 ------------------------------------------------------------- 50.00% 59.67% 0.00% 0.00% 0x0 1 1 0xffffffff8215fb86 352 200 191 558 32 [k] __handle_mm_fault [kernel.vmlinux] memory.c:2940 0 1 8.33% 5.00% 0.00% 0.00% 0x4 1 1 0xffffffff8210ed44 370 284 263 42 24 [k] filemap_read [kernel.vmlinux] atomic.h:23 0 1 0.00% 0.00% 5.26% 2.86% 0x8 1 1 0xffffffff8214bce1 0 0 0 4 4 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:204 0 1 33.33% 14.33% 0.00% 0.00% 0x8 1 1 0xffffffff8214ba75 344 186 219 140 32 [k] vma_interval_tree_insert [kernel.vmlinux] interval_tree.c:23 0 1 0.00% 0.00% 94.74% 97.14% 0x10 1 1 0xffffffff8214bd24 0 0 0 88 29 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:339 0 1 8.33% 20.00% 0.00% 0.00% 0x10 1 1 0xffffffff8214bb3b 296 209 226 167 31 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:338 0 1 0.00% 0.67% 0.00% 0.00% 0x28 1 1 0xffffffff82206f45 0 140 334 4 3 [k] do_dentry_open [kernel.vmlinux] open.c:966 0 1 0.00% 0.33% 0.00% 0.00% 0x38 1 1 0xffffffff8250a6c4 0 286 126 5 5 [k] errseq_sample [kernel.vmlinux] errseq.c:125 0 Signed-off-by: JonasZhou Link: https://lore.kernel.org/r/20240202083304.10995-1-JonasZhou-oc@zhaoxin.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index ebce4763b4bb..9efd6220b7c6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -482,10 +482,10 @@ struct address_space { pgoff_t writeback_index; const struct address_space_operations *a_ops; unsigned long flags; - struct rw_semaphore i_mmap_rwsem; errseq_t wb_err; spinlock_t i_private_lock; struct list_head i_private_list; + struct rw_semaphore i_mmap_rwsem; void * i_private_data; } 
__attribute__((aligned(sizeof(long)))) __randomize_layout; /* -- cgit v1.2.3 From 75cabec0111b7ccb140d917cc9c481e845cc3498 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 31 Jan 2024 18:01:45 -0500 Subject: filelock: add some new helper functions In later patches we're going to embed some common fields into a new structure inside struct file_lock. Smooth the transition by adding some new helper functions, and converting the core file locking code to use them. Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20240131-flsplit-v3-4-c6129007ee8d@kernel.org Reviewed-by: NeilBrown Signed-off-by: Christian Brauner --- include/linux/filelock.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'include/linux') diff --git a/include/linux/filelock.h b/include/linux/filelock.h index 085ff6ba0653..a3cb59b7922a 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -147,6 +147,29 @@ int fcntl_setlk64(unsigned int, struct file *, unsigned int, int fcntl_setlease(unsigned int fd, struct file *filp, int arg); int fcntl_getlease(struct file *filp); +static inline bool lock_is_unlock(struct file_lock *fl) +{ + return fl->fl_type == F_UNLCK; +} + +static inline bool lock_is_read(struct file_lock *fl) +{ + return fl->fl_type == F_RDLCK; +} + +static inline bool lock_is_write(struct file_lock *fl) +{ + return fl->fl_type == F_WRLCK; +} + +static inline void locks_wake_up(struct file_lock *fl) +{ + wake_up(&fl->fl_wait); +} + +/* for walking lists of file_locks linked by fl_list */ +#define for_each_file_lock(_fl, _head) list_for_each_entry(_fl, _head, fl_list) + /* fs/locks.c */ void locks_free_lock_context(struct inode *inode); void locks_free_lock(struct file_lock *fl); @@ -223,6 +246,27 @@ static inline int fcntl_getlease(struct file *filp) return F_UNLCK; } +static inline bool lock_is_unlock(struct file_lock *fl) +{ + return false; +} + +static inline bool lock_is_read(struct file_lock *fl) +{ + return 
false; +} + +static inline bool lock_is_write(struct file_lock *fl) +{ + return false; +} + +static inline void locks_wake_up(struct file_lock *fl) +{ +} + +#define for_each_file_lock(_fl, _head) while(false) + static inline void locks_free_lock_context(struct inode *inode) { -- cgit v1.2.3 From a69ce85ec9af6bdc0b3511959a7dc1a324e5e16a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 31 Jan 2024 18:01:58 -0500 Subject: filelock: split common fields into struct file_lock_core In a future patch, we're going to split file leases into their own structure. Since a lot of the underlying machinery uses the same fields move those into a new file_lock_core, and embed that inside struct file_lock. For now, add some macros to ensure that we can continue to build while the conversion is in progress. Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20240131-flsplit-v3-17-c6129007ee8d@kernel.org Reviewed-by: NeilBrown Signed-off-by: Christian Brauner --- include/linux/filelock.h | 57 ++++++++++++++++++++++++++++++++--------------- include/linux/lockd/xdr.h | 3 ++- 2 files changed, 41 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filelock.h b/include/linux/filelock.h index a3cb59b7922a..d1fba98744a7 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -85,23 +85,28 @@ bool opens_in_grace(struct net *); * * Obviously, the last two criteria only matter for POSIX locks. 
*/ -struct file_lock { - struct file_lock *fl_blocker; /* The lock, that is blocking us */ - struct list_head fl_list; /* link into file_lock_context */ - struct hlist_node fl_link; /* node in global lists */ - struct list_head fl_blocked_requests; /* list of requests with + +struct file_lock_core { + struct file_lock *flc_blocker; /* The lock that is blocking us */ + struct list_head flc_list; /* link into file_lock_context */ + struct hlist_node flc_link; /* node in global lists */ + struct list_head flc_blocked_requests; /* list of requests with * ->fl_blocker pointing here */ - struct list_head fl_blocked_member; /* node in + struct list_head flc_blocked_member; /* node in * ->fl_blocker->fl_blocked_requests */ - fl_owner_t fl_owner; - unsigned int fl_flags; - unsigned char fl_type; - pid_t fl_pid; - int fl_link_cpu; /* what cpu's list is this on? */ - wait_queue_head_t fl_wait; - struct file *fl_file; + fl_owner_t flc_owner; + unsigned int flc_flags; + unsigned char flc_type; + pid_t flc_pid; + int flc_link_cpu; /* what cpu's list is this on? 
*/ + wait_queue_head_t flc_wait; + struct file *flc_file; +}; + +struct file_lock { + struct file_lock_core c; loff_t fl_start; loff_t fl_end; @@ -126,6 +131,22 @@ struct file_lock { } fl_u; } __randomize_layout; +/* Temporary macros to allow building during coccinelle conversion */ +#ifdef _NEED_FILE_LOCK_FIELD_MACROS +#define fl_list c.flc_list +#define fl_blocker c.flc_blocker +#define fl_link c.flc_link +#define fl_blocked_requests c.flc_blocked_requests +#define fl_blocked_member c.flc_blocked_member +#define fl_owner c.flc_owner +#define fl_flags c.flc_flags +#define fl_type c.flc_type +#define fl_pid c.flc_pid +#define fl_link_cpu c.flc_link_cpu +#define fl_wait c.flc_wait +#define fl_file c.flc_file +#endif + struct file_lock_context { spinlock_t flc_lock; struct list_head flc_flock; @@ -149,26 +170,26 @@ int fcntl_getlease(struct file *filp); static inline bool lock_is_unlock(struct file_lock *fl) { - return fl->fl_type == F_UNLCK; + return fl->c.flc_type == F_UNLCK; } static inline bool lock_is_read(struct file_lock *fl) { - return fl->fl_type == F_RDLCK; + return fl->c.flc_type == F_RDLCK; } static inline bool lock_is_write(struct file_lock *fl) { - return fl->fl_type == F_WRLCK; + return fl->c.flc_type == F_WRLCK; } static inline void locks_wake_up(struct file_lock *fl) { - wake_up(&fl->fl_wait); + wake_up(&fl->c.flc_wait); } /* for walking lists of file_locks linked by fl_list */ -#define for_each_file_lock(_fl, _head) list_for_each_entry(_fl, _head, fl_list) +#define for_each_file_lock(_fl, _head) list_for_each_entry(_fl, _head, c.flc_list) /* fs/locks.c */ void locks_free_lock_context(struct inode *inode); diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h index b60fbcd8cdfa..a3f068b0ca86 100644 --- a/include/linux/lockd/xdr.h +++ b/include/linux/lockd/xdr.h @@ -11,6 +11,7 @@ #define LOCKD_XDR_H #include +#define _NEED_FILE_LOCK_FIELD_MACROS #include #include #include @@ -52,7 +53,7 @@ struct nlm_lock { * FreeBSD uses 16, Apple Mac OS 
X 10.3 uses 20. Therefore we set it to * 32 bytes. */ - + struct nlm_cookie { unsigned char data[NLM_MAXCOOKIELEN]; -- cgit v1.2.3 From b6aaba5b76e9596cb4d62d081cca41e114becacc Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 31 Jan 2024 18:02:07 -0500 Subject: filelock: convert fl_blocker to file_lock_core Both locks and leases deal with fl_blocker. Switch the fl_blocker pointer in struct file_lock_core to point to the file_lock_core of the blocker instead of a file_lock structure. Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20240131-flsplit-v3-26-c6129007ee8d@kernel.org Reviewed-by: NeilBrown Signed-off-by: Christian Brauner --- include/linux/filelock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filelock.h b/include/linux/filelock.h index d1fba98744a7..701780734ae1 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -87,7 +87,7 @@ bool opens_in_grace(struct net *); */ struct file_lock_core { - struct file_lock *flc_blocker; /* The lock that is blocking us */ + struct file_lock_core *flc_blocker; /* The lock that is blocking us */ struct list_head flc_list; /* link into file_lock_context */ struct hlist_node flc_link; /* node in global lists */ struct list_head flc_blocked_requests; /* list of requests with -- cgit v1.2.3 From eb8ed7c6ab08cde2e8869adc72cc02c7368f0a21 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 31 Jan 2024 18:02:21 -0500 Subject: lockd: adapt to breakup of struct file_lock Most of the existing APIs have remained the same, but subsystems that access file_lock fields directly need to reach into struct file_lock_core now. 
Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20240131-flsplit-v3-40-c6129007ee8d@kernel.org Reviewed-by: NeilBrown Signed-off-by: Christian Brauner --- include/linux/lockd/lockd.h | 8 ++++---- include/linux/lockd/xdr.h | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 9f565416d186..1b95fe31051f 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -375,12 +375,12 @@ static inline int nlm_privileged_requester(const struct svc_rqst *rqstp) static inline int nlm_compare_locks(const struct file_lock *fl1, const struct file_lock *fl2) { - return file_inode(fl1->fl_file) == file_inode(fl2->fl_file) - && fl1->fl_pid == fl2->fl_pid - && fl1->fl_owner == fl2->fl_owner + return file_inode(fl1->c.flc_file) == file_inode(fl2->c.flc_file) + && fl1->c.flc_pid == fl2->c.flc_pid + && fl1->c.flc_owner == fl2->c.flc_owner && fl1->fl_start == fl2->fl_start && fl1->fl_end == fl2->fl_end - &&(fl1->fl_type == fl2->fl_type || fl2->fl_type == F_UNLCK); + &&(fl1->c.flc_type == fl2->c.flc_type || fl2->c.flc_type == F_UNLCK); } extern const struct lock_manager_operations nlmsvc_lock_operations; diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h index a3f068b0ca86..80cca9426761 100644 --- a/include/linux/lockd/xdr.h +++ b/include/linux/lockd/xdr.h @@ -11,7 +11,6 @@ #define LOCKD_XDR_H #include -#define _NEED_FILE_LOCK_FIELD_MACROS #include #include #include -- cgit v1.2.3 From 282c30f320ba25794b66c2231ab134d15465ef21 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 31 Jan 2024 18:02:27 -0500 Subject: filelock: remove temporary compatibility macros Everything has been converted to access fl_core fields directly, so we can now drop these. 
Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20240131-flsplit-v3-46-c6129007ee8d@kernel.org Reviewed-by: NeilBrown Signed-off-by: Christian Brauner --- include/linux/filelock.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filelock.h b/include/linux/filelock.h index 701780734ae1..0606e806da76 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -131,22 +131,6 @@ struct file_lock { } fl_u; } __randomize_layout; -/* Temporary macros to allow building during coccinelle conversion */ -#ifdef _NEED_FILE_LOCK_FIELD_MACROS -#define fl_list c.flc_list -#define fl_blocker c.flc_blocker -#define fl_link c.flc_link -#define fl_blocked_requests c.flc_blocked_requests -#define fl_blocked_member c.flc_blocked_member -#define fl_owner c.flc_owner -#define fl_flags c.flc_flags -#define fl_type c.flc_type -#define fl_pid c.flc_pid -#define fl_link_cpu c.flc_link_cpu -#define fl_wait c.flc_wait -#define fl_file c.flc_file -#endif - struct file_lock_context { spinlock_t flc_lock; struct list_head flc_flock; -- cgit v1.2.3 From c69ff4071935f946f1cddc59e1d36a03442ed015 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 31 Jan 2024 18:02:28 -0500 Subject: filelock: split leases out of struct file_lock Add a new struct file_lease and move the lease-specific fields from struct file_lock to it. Convert the appropriate API calls to take struct file_lease instead, and convert the callers to use them. There is zero overlap between the lock manager operations for file locks and the ones for file leases, so split the lease-related operations off into a new lease_manager_operations struct. 
Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20240131-flsplit-v3-47-c6129007ee8d@kernel.org Reviewed-by: NeilBrown Signed-off-by: Christian Brauner --- include/linux/filelock.h | 49 ++++++++++++++++++++++++++++++++---------------- include/linux/fs.h | 5 +++-- 2 files changed, 36 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filelock.h b/include/linux/filelock.h index 0606e806da76..553d65a88048 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -27,6 +27,7 @@ #define FILE_LOCK_DEFERRED 1 struct file_lock; +struct file_lease; struct file_lock_operations { void (*fl_copy_lock)(struct file_lock *, struct file_lock *); @@ -39,14 +40,17 @@ struct lock_manager_operations { void (*lm_put_owner)(fl_owner_t); void (*lm_notify)(struct file_lock *); /* unblock callback */ int (*lm_grant)(struct file_lock *, int); - bool (*lm_break)(struct file_lock *); - int (*lm_change)(struct file_lock *, int, struct list_head *); - void (*lm_setup)(struct file_lock *, void **); - bool (*lm_breaker_owns_lease)(struct file_lock *); bool (*lm_lock_expirable)(struct file_lock *cfl); void (*lm_expire_lock)(void); }; +struct lease_manager_operations { + bool (*lm_break)(struct file_lease *); + int (*lm_change)(struct file_lease *, int, struct list_head *); + void (*lm_setup)(struct file_lease *, void **); + bool (*lm_breaker_owns_lease)(struct file_lease *); +}; + struct lock_manager { struct list_head list; /* @@ -110,11 +114,6 @@ struct file_lock { loff_t fl_start; loff_t fl_end; - struct fasync_struct * fl_fasync; /* for lease break notifications */ - /* for lease breaks: */ - unsigned long fl_break_time; - unsigned long fl_downgrade_time; - const struct file_lock_operations *fl_ops; /* Callbacks for filesystems */ const struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */ union { @@ -131,6 +130,15 @@ struct file_lock { } fl_u; } __randomize_layout; +struct file_lease { + struct file_lock_core c; 
+ struct fasync_struct * fl_fasync; /* for lease break notifications */ + /* for lease breaks: */ + unsigned long fl_break_time; + unsigned long fl_downgrade_time; + const struct lease_manager_operations *fl_lmops; /* Callbacks for lease managers */ +} __randomize_layout; + struct file_lock_context { spinlock_t flc_lock; struct list_head flc_flock; @@ -179,7 +187,7 @@ static inline void locks_wake_up(struct file_lock *fl) void locks_free_lock_context(struct inode *inode); void locks_free_lock(struct file_lock *fl); void locks_init_lock(struct file_lock *); -struct file_lock * locks_alloc_lock(void); +struct file_lock *locks_alloc_lock(void); void locks_copy_lock(struct file_lock *, struct file_lock *); void locks_copy_conflock(struct file_lock *, struct file_lock *); void locks_remove_posix(struct file *, fl_owner_t); @@ -193,11 +201,15 @@ int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_l int vfs_cancel_lock(struct file *filp, struct file_lock *fl); bool vfs_inode_has_locks(struct inode *inode); int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl); + +void locks_init_lease(struct file_lease *); +void locks_free_lease(struct file_lease *fl); +struct file_lease *locks_alloc_lease(void); int __break_lease(struct inode *inode, unsigned int flags, unsigned int type); void lease_get_mtime(struct inode *, struct timespec64 *time); -int generic_setlease(struct file *, int, struct file_lock **, void **priv); -int vfs_setlease(struct file *, int, struct file_lock **, void **); -int lease_modify(struct file_lock *, int, struct list_head *); +int generic_setlease(struct file *, int, struct file_lease **, void **priv); +int vfs_setlease(struct file *, int, struct file_lease **, void **); +int lease_modify(struct file_lease *, int, struct list_head *); struct notifier_block; int lease_register_notifier(struct notifier_block *); @@ -282,6 +294,11 @@ static inline void locks_init_lock(struct file_lock *fl) return; } +static inline 
void locks_init_lease(struct file_lease *fl) +{ + return; +} + static inline void locks_copy_conflock(struct file_lock *new, struct file_lock *fl) { return; @@ -356,18 +373,18 @@ static inline void lease_get_mtime(struct inode *inode, } static inline int generic_setlease(struct file *filp, int arg, - struct file_lock **flp, void **priv) + struct file_lease **flp, void **priv) { return -EINVAL; } static inline int vfs_setlease(struct file *filp, int arg, - struct file_lock **lease, void **priv) + struct file_lease **lease, void **priv) { return -EINVAL; } -static inline int lease_modify(struct file_lock *fl, int arg, +static inline int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose) { return -EINVAL; diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a70495..162877197bf1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1064,6 +1064,7 @@ struct file *get_file_active(struct file **f); typedef void *fl_owner_t; struct file_lock; +struct file_lease; /* The following constant reflects the upper bound of the file/locking space */ #ifndef OFFSET_MAX @@ -2005,7 +2006,7 @@ struct file_operations { ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); void (*splice_eof)(struct file *file); - int (*setlease)(struct file *, int, struct file_lock **, void **); + int (*setlease)(struct file *, int, struct file_lease **, void **); long (*fallocate)(struct file *file, int mode, loff_t offset, loff_t len); void (*show_fdinfo)(struct seq_file *m, struct file *f); @@ -3238,7 +3239,7 @@ extern int simple_write_begin(struct file *file, struct address_space *mapping, extern const struct address_space_operations ram_aops; extern int always_delete_dentry(const struct dentry *); extern struct inode *alloc_anon_inode(struct super_block *); -extern int simple_nosetlease(struct file *, int, struct 
file_lock **, void **); +extern int simple_nosetlease(struct file *, int, struct file_lease **, void **); extern const struct dentry_operations simple_dentry_operations; extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags); -- cgit v1.2.3 From 7b8001013d720c232ad9ae7aae0ef0e7c281c6d4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 5 Feb 2024 07:09:31 -0500 Subject: filelock: don't do security checks on nfsd setlease calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Zdenek reported seeing some AVC denials due to nfsd trying to set delegations: type=AVC msg=audit(09.11.2023 09:03:46.411:496) : avc: denied { lease } for pid=5127 comm=rpc.nfsd capability=lease scontext=system_u:system_r:nfsd_t:s0 tcontext=system_u:system_r:nfsd_t:s0 tclass=capability permissive=0 When setting delegations on behalf of nfsd, we don't want to do all of the normal capability and LSM checks. nfsd is a kernel thread and runs with CAP_LEASE set, so the uid checks end up being a no-op in most cases anyway. Some nfsd functions can end up running in normal process context when tearing down the server. At that point, the CAP_LEASE check can fail and cause the client to not tear down delegations when expected. Also, the way the per-fs ->setlease handlers work today is a little convoluted. The non-trivial ones are wrappers around generic_setlease, so when they fail due to permission problems they usually end up doing a little extra work only to determine that they can't set the lease anyway. It would be more efficient to do those checks earlier. Transplant the permission checking from generic_setlease to vfs_setlease, which will make the permission checking happen earlier on filesystems that have a ->setlease operation. Add a new kernel_setlease function that bypasses these checks, and switch nfsd to use that instead of vfs_setlease. 
There is one behavioral change here: prior to this patch the setlease_notifier would fire even if the lease attempt was going to fail the security checks later. With this change, it doesn't fire until the caller has passed them. I think this is a desirable change overall. nfsd is the only user of the setlease_notifier and it doesn't benefit from being notified about failed attempts. Cc: Ondrej Mosnáček Reported-by: Zdenek Pytela Closes: https://bugzilla.redhat.com/show_bug.cgi?id=2248830 Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20240205-bz2248830-v1-1-d0ec0daecba1@kernel.org Acked-by: Tom Talpey Reviewed-by: NeilBrown Signed-off-by: Christian Brauner --- include/linux/filelock.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/filelock.h b/include/linux/filelock.h index 553d65a88048..aabd4bdf7eba 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -208,6 +208,7 @@ struct file_lease *locks_alloc_lease(void); int __break_lease(struct inode *inode, unsigned int flags, unsigned int type); void lease_get_mtime(struct inode *, struct timespec64 *time); int generic_setlease(struct file *, int, struct file_lease **, void **priv); +int kernel_setlease(struct file *, int, struct file_lease **, void **); int vfs_setlease(struct file *, int, struct file_lease **, void **); int lease_modify(struct file_lease *, int, struct list_head *); @@ -378,6 +379,12 @@ static inline int generic_setlease(struct file *filp, int arg, return -EINVAL; } +static inline int kernel_setlease(struct file *filp, int arg, + struct file_lease **lease, void **priv) +{ + return -EINVAL; +} + static inline int vfs_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv) { -- cgit v1.2.3 From bc88528cda2eddc3e5ea304fc3f147f1b4186aa4 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Mon, 29 Jan 2024 17:09:44 +0100 Subject: PM: sleep: stats: Use array of suspend step names Replace suspend_step_name() in the suspend statistics code with an array of suspend step names which has fewer lines of code and less overhead. While at it, remove two unnecessary line breaks in suspend_stats_show() and adjust some white space in there to the kernel coding style for a more consistent code layout. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Stanislaw Gruszka Reviewed-by: Ulf Hansson --- include/linux/suspend.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index ef503088942d..58f7352af205 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -41,7 +41,8 @@ typedef int __bitwise suspend_state_t; #define PM_SUSPEND_MAX ((__force suspend_state_t) 4) enum suspend_stat_step { - SUSPEND_FREEZE = 1, + SUSPEND_WORKING = 0, + SUSPEND_FREEZE, SUSPEND_PREPARE, SUSPEND_SUSPEND, SUSPEND_SUSPEND_LATE, -- cgit v1.2.3 From 34a956739d295de6010cdaafeed698ccbba87ea4 Mon Sep 17 00:00:00 2001 From: Ezra Buehler Date: Thu, 25 Jan 2024 22:01:07 +0200 Subject: mtd: spinand: Add support for 5-byte IDs E.g. ESMT chips will return an identification code with a length of 5 bytes. In order to prevent ambiguity, flash chips would actually need to return IDs that are up to 17 or more bytes long due to JEDEC's continuation scheme. I understand that if a manufacturer ID is located in bank N of JEDEC's database (there are currently 16 banks), N - 1 continuation codes (7Fh) need to be added to the identification code (comprising of manufacturer ID and device ID). However, most flash chip manufacturers don't seem to implement this (correctly). 
Signed-off-by: Ezra Buehler Reviewed-by: Martin Kurbanov Tested-by: Martin Kurbanov Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20240125200108.24374-2-ezra@easyb.ch --- include/linux/mtd/spinand.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index badb4c1ac079..5c19ead60499 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -169,7 +169,7 @@ struct spinand_op; struct spinand_device; -#define SPINAND_MAX_ID_LEN 4 +#define SPINAND_MAX_ID_LEN 5 /* * For erase, write and read operation, we got the following timings : * tBERS (erase) 1ms to 4ms -- cgit v1.2.3 From b730bab0b9c4204d7dda3f5bc8adf4292497fc39 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 29 Jan 2024 17:11:57 +0100 Subject: PM: sleep: stats: Use an array of step failure counters Instead of using a set of individual struct suspend_stats fields representing suspend step failure counters, use an array of counters indexed by enum suspend_stat_step for this purpose, which allows dpm_save_failed_step() to increment the appropriate counter automatically, so that its callers don't need to do that directly. It also allows suspend_stats_show() to carry out a loop over the counters array to print their values. Because the counters cannot become negative, use unsigned int for representing them. The only user-observable impact of this change is a different ordering of entries in the suspend_stats debugfs file which is not expected to matter. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Stanislaw Gruszka Reviewed-by: Ulf Hansson --- include/linux/suspend.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 58f7352af205..5e4c4d4aed95 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -52,17 +52,12 @@ enum suspend_stat_step { SUSPEND_RESUME }; +#define SUSPEND_NR_STEPS SUSPEND_RESUME + struct suspend_stats { + unsigned int step_failures[SUSPEND_NR_STEPS]; int success; int fail; - int failed_freeze; - int failed_prepare; - int failed_suspend; - int failed_suspend_late; - int failed_suspend_noirq; - int failed_resume; - int failed_resume_early; - int failed_resume_noirq; #define REC_FAILED_NUM 2 int last_failed_dev; char failed_devs[REC_FAILED_NUM][40]; @@ -95,6 +90,7 @@ static inline void dpm_save_failed_errno(int err) static inline void dpm_save_failed_step(enum suspend_stat_step step) { + suspend_stats.step_failures[step-1]++; suspend_stats.failed_steps[suspend_stats.last_failed_step] = step; suspend_stats.last_failed_step++; suspend_stats.last_failed_step %= REC_FAILED_NUM; -- cgit v1.2.3 From 2231f78d3e15e45abe534db1997bc6a2153dc01c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 29 Jan 2024 17:13:14 +0100 Subject: PM: sleep: stats: Use unsigned int for success and failure counters Change the type of the "success" and "fail" fields in struct suspend_stats to unsigned int, because they cannot be negative. No intentional functional impact. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Stanislaw Gruszka Reviewed-by: Ulf Hansson --- include/linux/suspend.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 5e4c4d4aed95..216bae989535 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -56,8 +56,8 @@ enum suspend_stat_step { struct suspend_stats { unsigned int step_failures[SUSPEND_NR_STEPS]; - int success; - int fail; + unsigned int success; + unsigned int fail; #define REC_FAILED_NUM 2 int last_failed_dev; char failed_devs[REC_FAILED_NUM][40]; -- cgit v1.2.3 From 18af7e357033f1a1cee50db2663ef982b4a2226e Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Mon, 5 Feb 2024 12:09:55 +0200 Subject: mtd: flashchip: explicitly include <linux/wait.h> MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While reviewing the hyperbus sfdp proposal the following problem was noticed: In file included from ./include/linux/mtd/gen_probe.h:10, from drivers/mtd/hyperbus/hyperbus-sfdp.c:6: ./include/linux/mtd/flashchip.h:77:9: error: unknown type name ‘wait_queue_head_t’ 77 | wait_queue_head_t wq; /* Wait on here when we're waiting for the chip | ^~~~~~~~~~~~~~~~~ It is good practice to directly include all headers used, it avoids implicit dependencies and spurious breakage if someone rearranges headers and causes the implicit include to vanish. Explicitly include <linux/wait.h> in include/linux/mtd/flashchip.h. 
Signed-off-by: Tudor Ambarus Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20240205100955.149755-1-tudor.ambarus@linaro.org --- include/linux/mtd/flashchip.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mtd/flashchip.h b/include/linux/mtd/flashchip.h index c04f690871ca..9798c1a1d3b6 100644 --- a/include/linux/mtd/flashchip.h +++ b/include/linux/mtd/flashchip.h @@ -13,6 +13,7 @@ */ #include #include +#include typedef enum { FL_READY, -- cgit v1.2.3 From 9ff544fa5f94fe07f99a36d2138075b322067546 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 29 Jan 2024 17:30:44 +0100 Subject: PM: sleep: stats: Define suspend_stats next to the code using it It is not necessary to define struct suspend_stats in a header file and the suspend_stats variable in the core device system-wide PM code. They both can be defined in kernel/power/main.c, next to the sysfs and debugfs code accessing suspend_stats, which can be static. Modify the code in question in accordance with the above observation and replace the static inline functions manipulating suspend_stats with regular ones defined in kernel/power/main.c. While at it, move the enum suspend_stat_step to the end of suspend.h which is a more suitable place for it. No intentional functional impact. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Ulf Hansson --- include/linux/suspend.h | 71 +++++++++++-------------------------------------- 1 file changed, 15 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 216bae989535..da6ebca3ff77 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -40,62 +40,6 @@ typedef int __bitwise suspend_state_t; #define PM_SUSPEND_MIN PM_SUSPEND_TO_IDLE #define PM_SUSPEND_MAX ((__force suspend_state_t) 4) -enum suspend_stat_step { - SUSPEND_WORKING = 0, - SUSPEND_FREEZE, - SUSPEND_PREPARE, - SUSPEND_SUSPEND, - SUSPEND_SUSPEND_LATE, - SUSPEND_SUSPEND_NOIRQ, - SUSPEND_RESUME_NOIRQ, - SUSPEND_RESUME_EARLY, - SUSPEND_RESUME -}; - -#define SUSPEND_NR_STEPS SUSPEND_RESUME - -struct suspend_stats { - unsigned int step_failures[SUSPEND_NR_STEPS]; - unsigned int success; - unsigned int fail; -#define REC_FAILED_NUM 2 - int last_failed_dev; - char failed_devs[REC_FAILED_NUM][40]; - int last_failed_errno; - int errno[REC_FAILED_NUM]; - int last_failed_step; - u64 last_hw_sleep; - u64 total_hw_sleep; - u64 max_hw_sleep; - enum suspend_stat_step failed_steps[REC_FAILED_NUM]; -}; - -extern struct suspend_stats suspend_stats; - -static inline void dpm_save_failed_dev(const char *name) -{ - strscpy(suspend_stats.failed_devs[suspend_stats.last_failed_dev], - name, - sizeof(suspend_stats.failed_devs[0])); - suspend_stats.last_failed_dev++; - suspend_stats.last_failed_dev %= REC_FAILED_NUM; -} - -static inline void dpm_save_failed_errno(int err) -{ - suspend_stats.errno[suspend_stats.last_failed_errno] = err; - suspend_stats.last_failed_errno++; - suspend_stats.last_failed_errno %= REC_FAILED_NUM; -} - -static inline void dpm_save_failed_step(enum suspend_stat_step step) -{ - suspend_stats.step_failures[step-1]++; - suspend_stats.failed_steps[suspend_stats.last_failed_step] = step; - suspend_stats.last_failed_step++; - suspend_stats.last_failed_step %= REC_FAILED_NUM; -} - /** * 
struct platform_suspend_ops - Callbacks for managing platform dependent * system sleep states. @@ -623,4 +567,19 @@ static inline void queue_up_suspend_work(void) {} #endif /* !CONFIG_PM_AUTOSLEEP */ +enum suspend_stat_step { + SUSPEND_WORKING = 0, + SUSPEND_FREEZE, + SUSPEND_PREPARE, + SUSPEND_SUSPEND, + SUSPEND_SUSPEND_LATE, + SUSPEND_SUSPEND_NOIRQ, + SUSPEND_RESUME_NOIRQ, + SUSPEND_RESUME_EARLY, + SUSPEND_RESUME +}; + +void dpm_save_failed_dev(const char *name); +void dpm_save_failed_step(enum suspend_stat_step step); + #endif /* _LINUX_SUSPEND_H */ -- cgit v1.2.3 From c5d74fe6a7f4240f6060dc51dd113b8a45f6cb56 Mon Sep 17 00:00:00 2001 From: R SUNDAR Date: Sun, 4 Feb 2024 21:15:06 +0530 Subject: spi: Remove the @multi_cs_cap to prevent kernel-doc warnings ./include/linux/spi/spi.h:778: warning: Excess struct member 'multi_cs_cap' description in 'spi_controller' Signed-off-by: R SUNDAR Link: https://lore.kernel.org/r/20240204154506.3561-1-prosunofficial@gmail.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 600fbd5daf68..0b0616b2a9f9 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -422,8 +422,6 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch * @bus_lock_spinlock: spinlock for SPI bus locking * @bus_lock_mutex: mutex for exclusion of multiple callers * @bus_lock_flag: indicates that the SPI bus is locked for exclusive use - * @multi_cs_cap: indicates that the SPI Controller can assert/de-assert - * more than one chip select at once. * @setup: updates the device mode and clocking records used by a * device's SPI controller; protocol code may call this. This * must fail if an unrecognized or unsupported mode is requested. 
-- cgit v1.2.3 From 0ec74ad3c157bd4bcbcc8b294777733687e8cd2a Mon Sep 17 00:00:00 2001 From: Jan Dakinevich Date: Fri, 26 Jan 2024 23:08:36 +0300 Subject: regmap: rework ->max_register handling When a regmap consists of a single register, the 'regmap' subsystem is unable to understand whether ->max_register is set or not, because in both cases it is equal to zero. As a result, the logic based on the value of ->max_register doesn't work. For example, using REGCACHE_FLAT fails. This patch introduces an extra parameter to regmap config, indicating that zero value in ->max_register is authentic. Signed-off-by: Jan Dakinevich Link: https://lore.kernel.org/r/20240126200836.1829995-1-jan.dakinevich@salutedevices.com Signed-off-by: Mark Brown --- include/linux/regmap.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index c9182a47736e..b743241cfb7c 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -332,6 +332,10 @@ typedef void (*regmap_unlock)(void *); * @io_port: Support IO port accessors. Makes sense only when MMIO vs. IO port * access can be distinguished. * @max_register: Optional, specifies the maximum valid register address. + * @max_register_is_0: Optional, specifies that zero value in @max_register + * should be taken into account. This is a workaround to + * apply handling of @max_register for regmap that contains + * only one register. * @wr_table: Optional, points to a struct regmap_access_table specifying * valid ranges for write access. * @rd_table: As above, for read access. 
@@ -422,6 +426,7 @@ struct regmap_config { bool io_port; unsigned int max_register; + bool max_register_is_0; const struct regmap_access_table *wr_table; const struct regmap_access_table *rd_table; const struct regmap_access_table *volatile_table; -- cgit v1.2.3 From c4e47bbb00dad9240f4c054859950e962042ecb8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 16 Jan 2024 14:14:59 -0700 Subject: block: move cgroup time handling code into blk.h In preparation for moving time keeping into blk.h, move the cgroup related code for timestamps in here too. This will help avoid a circular dependency, and also moves it into a more appropriate header as this one is private to the block layer code. Leave struct bio_issue in blk_types.h as it's a proper time definition. Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 42 ------------------------------------------ 1 file changed, 42 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index f288c94374b3..1c07848dea7e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -206,52 +206,10 @@ static inline bool blk_path_error(blk_status_t error) return true; } -/* - * From most significant bit: - * 1 bit: reserved for other usage, see below - * 12 bits: original size of bio - * 51 bits: issue time of bio - */ -#define BIO_ISSUE_RES_BITS 1 -#define BIO_ISSUE_SIZE_BITS 12 -#define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS) -#define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS) -#define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1) -#define BIO_ISSUE_SIZE_MASK \ - (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT) -#define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1)) - -/* Reserved bit for blk-throtl */ -#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63) - struct bio_issue { u64 value; }; -static inline u64 __bio_issue_time(u64 time) -{ - return time & BIO_ISSUE_TIME_MASK; -} - -static inline u64 
bio_issue_time(struct bio_issue *issue) -{ - return __bio_issue_time(issue->value); -} - -static inline sector_t bio_issue_size(struct bio_issue *issue) -{ - return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT); -} - -static inline void bio_issue_init(struct bio_issue *issue, - sector_t size) -{ - size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1; - issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | - (ktime_get_ns() & BIO_ISSUE_TIME_MASK) | - ((u64)size << BIO_ISSUE_SIZE_SHIFT)); -} - typedef __u32 __bitwise blk_opf_t; typedef unsigned int blk_qc_t; -- cgit v1.2.3 From da4c8c3d0975f031ef82d39927102e39fa6ddfac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 15 Jan 2024 14:46:03 -0700 Subject: block: cache current nsec time in struct blk_plug Querying the current time is the most costly thing we do in the block layer per IO, and depending on kernel config settings, we may do it many times per IO. None of the callers actually need nsec granularity. Take advantage of that by caching the current time in the plug, with the assumption here being that any time checking will be temporally close enough that the slight loss of precision doesn't matter. If the block plug gets flushed, eg on preempt or schedule out, then we invalidate the cached clock. 
On a basic peak IOPS test case with iostats enabled, this changes the performance from: IOPS=108.41M, BW=52.93GiB/s, IOS/call=31/31 IOPS=108.43M, BW=52.94GiB/s, IOS/call=32/32 IOPS=108.29M, BW=52.88GiB/s, IOS/call=31/32 IOPS=108.35M, BW=52.91GiB/s, IOS/call=32/32 IOPS=108.42M, BW=52.94GiB/s, IOS/call=31/31 IOPS=108.40M, BW=52.93GiB/s, IOS/call=32/32 IOPS=108.31M, BW=52.89GiB/s, IOS/call=32/31 to IOPS=118.79M, BW=58.00GiB/s, IOS/call=31/32 IOPS=118.62M, BW=57.92GiB/s, IOS/call=31/31 IOPS=118.80M, BW=58.01GiB/s, IOS/call=32/31 IOPS=118.78M, BW=58.00GiB/s, IOS/call=32/32 IOPS=118.69M, BW=57.95GiB/s, IOS/call=32/31 IOPS=118.62M, BW=57.92GiB/s, IOS/call=32/31 IOPS=118.63M, BW=57.92GiB/s, IOS/call=31/32 which is more than a 9% improvement in performance. Looking at perf diff, we can see a huge reduction in time overhead: 10.55% -9.88% [kernel.vmlinux] [k] read_tsc 1.31% -1.22% [kernel.vmlinux] [k] ktime_get Note that since this relies on blk_plug for the caching, it's only applicable to the issue side. But this is where most of the time calls happen anyway. On the completion side, cached time stamping is done with struct io_comp patch, as long as the driver supports it. It's also worth noting that the above testing doesn't enable any of the higher cost CPU items on the block layer side, like wbt, cgroups, iocost, etc, which all would add additional time querying and hence overhead. IOW, results would likely look even better in comparison with those enabled, as distros would do. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 99e4f5e72213..996d2ad756ff 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -942,6 +942,7 @@ struct blk_plug { /* if ios_left is > 1, we can batch tag/rq allocations */ struct request *cached_rq; + u64 cur_ktime; unsigned short nr_ios; unsigned short rq_count; -- cgit v1.2.3 From 06b23f92af87a84d70881b2ecaa72e00f7838264 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 16 Jan 2024 09:18:39 -0700 Subject: block: update cached timestamp post schedule/preemption Mark the task as having a cached timestamp when we assign it, so we can efficiently check if it needs updating post being scheduled back in. This covers both the actual schedule out case, which would've flushed the plug, and the preemption case which doesn't touch the plugged requests (for many reasons, one of them being then we'd need to have preemption disabled around plug state manipulation).
Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 16 ++++++++++++++++ include/linux/sched.h | 2 +- 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 996d2ad756ff..d7cac3de65b3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -973,6 +973,18 @@ static inline void blk_flush_plug(struct blk_plug *plug, bool async) __blk_flush_plug(plug, async); } +/* + * tsk == current here + */ +static inline void blk_plug_invalidate_ts(struct task_struct *tsk) +{ + struct blk_plug *plug = tsk->plug; + + if (plug) + plug->cur_ktime = 0; + current->flags &= ~PF_BLOCK_TS; +} + int blkdev_issue_flush(struct block_device *bdev); long nr_blockdev_pages(void); #else /* CONFIG_BLOCK */ @@ -996,6 +1008,10 @@ static inline void blk_flush_plug(struct blk_plug *plug, bool async) { } +static inline void blk_plug_invalidate_ts(struct task_struct *tsk) +{ +} + static inline int blkdev_issue_flush(struct block_device *bdev) { return 0; diff --git a/include/linux/sched.h b/include/linux/sched.h index ffe8f618ab86..15b7cb478d16 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1642,7 +1642,7 @@ extern struct pid *cad_pid; #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. 
*/ -#define PF__HOLE__20000000 0x20000000 +#define PF_BLOCK_TS 0x20000000 /* plug has ts that needs updating */ #define PF__HOLE__40000000 0x40000000 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ -- cgit v1.2.3 From 2719a9e7156c4b3983b43db467c1ff96801bda99 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 31 Jan 2024 23:37:25 +0100 Subject: wifi: cw1200: Convert to GPIO descriptors The CW1200 uses two GPIOs to control the powerup and reset pins, get these from GPIO descriptors instead of being passed as platform data from boardfiles. The RESET line will need to be marked as active low as we will let gpiolib handle the polarity inversion. The SDIO case is a bit special since the "card" needs to be powered up before it gets detected on the SDIO bus and properly probed. Fix this by using board-specific GPIOs assigned to device "NULL". There are currently no in-tree users. Signed-off-by: Linus Walleij Signed-off-by: Kalle Valo Link: https://msgid.link/20240131-descriptors-wireless-v1-6-e1c7c5d68746@linaro.org --- include/linux/platform_data/net-cw1200.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/net-cw1200.h b/include/linux/platform_data/net-cw1200.h index c510734405bb..89d0ec6f7d46 100644 --- a/include/linux/platform_data/net-cw1200.h +++ b/include/linux/platform_data/net-cw1200.h @@ -14,8 +14,6 @@ struct cw1200_platform_data_spi { /* All others are optional */ bool have_5ghz; - int reset; /* GPIO to RSTn signal (0 disables) */ - int powerup; /* GPIO to POWERUP signal (0 disables) */ int (*power_ctrl)(const struct cw1200_platform_data_spi *pdata, bool enable); /* Control 3v3 / 1v8 supply */ int (*clk_ctrl)(const struct cw1200_platform_data_spi *pdata, @@ -30,8 +28,6 @@ struct cw1200_platform_data_sdio { /* All others are optional */ bool have_5ghz; bool no_nptb; /* SDIO hardware does not support non-power-of-2-blocksizes */ - int reset; /*
GPIO to RSTn signal (0 disables) */ - int powerup; /* GPIO to POWERUP signal (0 disables) */ int irq; /* IRQ line or 0 to use SDIO IRQ */ int (*power_ctrl)(const struct cw1200_platform_data_sdio *pdata, bool enable); /* Control 3v3 / 1v8 supply */ -- cgit v1.2.3 From 8fea0c8fda30129b4168464975505d5dc9735ac1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 4 Feb 2024 11:34:34 -1000 Subject: usb: core: hcd: Convert from tasklet to BH workqueue The only generic interface to execute asynchronously in the BH context is tasklet; however, it's marked deprecated and has some design flaws. To replace tasklets, BH workqueue support was recently added. A BH workqueue behaves similarly to regular workqueues except that the queued work items are executed in the BH context. This patch converts usb hcd from tasklet to BH workqueue. Signed-off-by: Tejun Heo Acked-by: Greg Kroah-Hartman Cc: Alan Stern Cc: linux-usb@vger.kernel.org --- include/linux/usb/hcd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index 00724b4f6e12..f698aac71de3 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -55,7 +55,7 @@ struct giveback_urb_bh { bool high_prio; spinlock_t lock; struct list_head head; - struct tasklet_struct bh; + struct work_struct bh; struct usb_host_endpoint *completing_ep; }; -- cgit v1.2.3 From 3bc1e711c26bff01d41ad71145ecb8dcb4412576 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Feb 2024 14:19:10 -1000 Subject: workqueue: Don't implicitly make UNBOUND workqueues w/ @max_active==1 ordered 5c0338c68706 ("workqueue: restore WQ_UNBOUND/max_active==1 to be ordered") automatically promoted UNBOUND workqueues w/ @max_active==1 to ordered workqueues because UNBOUND workqueues w/ @max_active==1 used to be the way to create ordered workqueues and the new NUMA support broke it.
These problems can be subtle and the fact that they can only trigger on NUMA machines made them even more difficult to debug. However, overloading the UNBOUND allocation interface this way creates other issues. It's difficult to tell whether a given workqueue actually needs to be ordered and users that legitimately want a min concurrency level wq unexpectedly get an ordered one instead. With planned UNBOUND workqueue updates to improve execution locality and more prevalence of chiplet designs which can benefit from such improvements, this isn't a state we wanna be in forever. There aren't that many UNBOUND w/ @max_active==1 users in the tree and the preceding patches audited all and converted them to alloc_ordered_workqueue() as appropriate. This patch removes the implicit promotion of UNBOUND w/ @max_active==1 workqueues to ordered ones. v2: v1 patch incorrectly dropped !list_empty(&wq->pwqs) condition in apply_workqueue_attrs_locked() which spuriously triggers WARNING and fails workqueue creation. Fix it. Signed-off-by: Tejun Heo Reported-by: kernel test robot Link: https://lore.kernel.org/oe-lkp/202304251050.45a5df1f-oliver.sang@intel.com --- include/linux/workqueue.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 283d7891b4c4..4ba33cf07f11 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -392,7 +392,6 @@ enum wq_flags { __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ - __WQ_ORDERED_EXPLICIT = 1 << 19, /* internal: alloc_ordered_workqueue() */ /* BH wq only allows the following flags */ __WQ_BH_ALLOWS = WQ_BH | WQ_HIGHPRI, @@ -507,8 +506,7 @@ alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...); * Pointer to the allocated workqueue on success, %NULL on failure.
*/ #define alloc_ordered_workqueue(fmt, flags, args...) \ - alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | \ - __WQ_ORDERED_EXPLICIT | (flags), 1, ##args) + alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) #define create_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name)) -- cgit v1.2.3 From fd2bc4195d5107f88c1b90e1ec935888ccbfc5c0 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 3 Oct 2023 20:57:20 +0300 Subject: xfrm: generalize xdo_dev_state_update_curlft to allow statistics update In order to allow drivers to fill all statistics, change the name of xdo_dev_state_update_curlft to be xdo_dev_state_update_stats. Acked-by: Steffen Klassert Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 118c40258d07..9538576dbebc 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1062,7 +1062,7 @@ struct xfrmdev_ops { bool (*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); void (*xdo_dev_state_advance_esn) (struct xfrm_state *x); - void (*xdo_dev_state_update_curlft) (struct xfrm_state *x); + void (*xdo_dev_state_update_stats) (struct xfrm_state *x); int (*xdo_dev_policy_add) (struct xfrm_policy *x, struct netlink_ext_ack *extack); void (*xdo_dev_policy_delete) (struct xfrm_policy *x); void (*xdo_dev_policy_free) (struct xfrm_policy *x); -- cgit v1.2.3 From 91a72ada66053b4dba95cf1a60a5a23fdbd6faf7 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Tue, 26 Dec 2023 10:22:08 +0200 Subject: net/mlx5: Remove initial segmentation duplicate definitions Device definitions belong in mlx5_ifc, remove the duplicates in mlx5_core.h. 
Signed-off-by: Gal Pressman Reviewed-by: Jianbo Liu Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 6c44f107b8ba..7f5e846eb46d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10661,6 +10661,7 @@ enum { MLX5_INITIAL_SEG_NIC_INTERFACE_FULL_DRIVER = 0x0, MLX5_INITIAL_SEG_NIC_INTERFACE_DISABLED = 0x1, MLX5_INITIAL_SEG_NIC_INTERFACE_NO_DRAM_NIC = 0x2, + MLX5_INITIAL_SEG_NIC_INTERFACE_SW_RESET = 0x7, }; enum { -- cgit v1.2.3 From dad6a09f3148257ac1773cd90934d721d68ab595 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 29 Jan 2024 15:56:36 -0800 Subject: hrtimer: Report offline hrtimer enqueue The hrtimers migration on CPU-down hotplug process has been moved earlier, before the CPU actually goes to die. This leaves a small window of opportunity to queue an hrtimer in a blind spot, leaving it ignored. For example a practical case has been reported with RCU waking up a SCHED_FIFO task right before the CPUHP_AP_IDLE_DEAD stage, queuing that way a sched/rt timer to the local offline CPU. Make sure such situations never go unnoticed and warn when that happens. Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier") Reported-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. 
McKenney Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240129235646.3171983-4-boqun.feng@gmail.com --- include/linux/hrtimer.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 87e3bedf8eb0..641c4567cfa7 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -157,6 +157,7 @@ enum hrtimer_base_type { * @max_hang_time: Maximum time spent in hrtimer_interrupt * @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are * expired + * @online: CPU is online from an hrtimers point of view * @timer_waiters: A hrtimer_cancel() invocation waits for the timer * callback to finish. * @expires_next: absolute time of the next event, is required for remote @@ -179,7 +180,8 @@ struct hrtimer_cpu_base { unsigned int hres_active : 1, in_hrtirq : 1, hang_detected : 1, - softirq_activated : 1; + softirq_activated : 1, + online : 1; #ifdef CONFIG_HIGH_RES_TIMERS unsigned int nr_events; unsigned short nr_retries; -- cgit v1.2.3 From 0bd199fd9c19aa545f677fd0a99f2be101cb6309 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 2 Feb 2024 17:41:45 +0000 Subject: net: phy: constify phydev->drv Device driver structures are shared between all devices that they match, and thus nothing should ever write to the device driver structure through the phydev->drv pointer. Let's make this pointer const to catch code that attempts to do so.
Suggested-by: Christian Marangi Signed-off-by: Russell King (Oracle) Link: https://lore.kernel.org/r/E1rVxXt-002YqY-9G@rmk-PC.armlinux.org.uk Signed-off-by: Paolo Abeni --- include/linux/phy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index a66f07d3f5f4..ad93f8b1b128 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -638,7 +638,7 @@ struct phy_device { /* Information about the PHY type */ /* And management functions */ - struct phy_driver *drv; + const struct phy_driver *drv; struct device_link *devlink; -- cgit v1.2.3 From e2e8a142fbd988d658ccb3da1d6f4b26a39de0fd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 5 Feb 2024 18:43:47 +0100 Subject: pidfd: exit: kill the no longer used thread_group_exited() It was used by pidfd_poll() but now it has no callers. If it finally finds a modular user we can revert this change, but note that the comment above this helper and the changelog in 38fd525a4c61 ("exit: Factor thread_group_exited out of pidfd_poll") are not accurate, thread_group_exited() won't return true if all other threads have passed exit_notify() and are zombies, it returns true only when all other threads are completely gone. Not to mention that it can only work if the task identified by @pid is a thread-group leader. 
Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240205174347.GA31461@redhat.com Reviewed-by: Tycho Andersen Signed-off-by: Christian Brauner --- include/linux/sched/signal.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 4b7664c56208..0a0e23c45406 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -735,8 +735,6 @@ static inline int thread_group_empty(struct task_struct *p) #define delay_group_leader(p) \ (thread_group_leader(p) && !thread_group_empty(p)) -extern bool thread_group_exited(struct pid *pid); - extern struct sighand_struct *__lock_task_sighand(struct task_struct *task, unsigned long *flags); -- cgit v1.2.3 From fe3944fb245ab99570552a3bf970b00058a9ca6d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 2 Feb 2024 12:39:23 -0800 Subject: fs: Move enum rw_hint into a new header file Move enum rw_hint into a new header file to prepare for using this data type in the block layer. Add the attribute __packed to reduce the space occupied by instances of this data type from four bytes to one byte. Change the data type of i_write_hint from u8 into enum rw_hint. 
Reviewed-by: Christoph Hellwig Acked-by: Chao Yu # for the F2FS part Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: Christoph Hellwig Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240202203926.2478590-5-bvanassche@acm.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 16 ++-------------- include/linux/rw_hint.h | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 14 deletions(-) create mode 100644 include/linux/rw_hint.h (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a70495..bdabda5dc364 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -309,19 +310,6 @@ struct address_space; struct writeback_control; struct readahead_control; -/* - * Write life time hint values. - * Stored in struct inode as u8. - */ -enum rw_hint { - WRITE_LIFE_NOT_SET = 0, - WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE, - WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT, - WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM, - WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG, - WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME, -}; - /* Match RWF_* bits to IOCB bits */ #define IOCB_HIPRI (__force int) RWF_HIPRI #define IOCB_DSYNC (__force int) RWF_DSYNC @@ -677,7 +665,7 @@ struct inode { spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ unsigned short i_bytes; u8 i_blkbits; - u8 i_write_hint; + enum rw_hint i_write_hint; blkcnt_t i_blocks; #ifdef __NEED_I_SIZE_ORDERED diff --git a/include/linux/rw_hint.h b/include/linux/rw_hint.h new file mode 100644 index 000000000000..309ca72f2dfb --- /dev/null +++ b/include/linux/rw_hint.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_RW_HINT_H +#define _LINUX_RW_HINT_H + +#include +#include +#include + +/* Block storage write lifetime hint values. 
*/ +enum rw_hint { + WRITE_LIFE_NOT_SET = RWH_WRITE_LIFE_NOT_SET, + WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE, + WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT, + WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM, + WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG, + WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME, +} __packed; + +/* Sparse ignores __packed annotations on enums, hence the #ifndef below. */ +#ifndef __CHECKER__ +static_assert(sizeof(enum rw_hint) == 1); +#endif + +#endif /* _LINUX_RW_HINT_H */ -- cgit v1.2.3 From 449813515d3e5efec85206bb91588a6249a421a3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 2 Feb 2024 12:39:25 -0800 Subject: block, fs: Restore the per-bio/request data lifetime fields Restore support for passing data lifetime information from filesystems to block drivers. This patch reverts commit b179c98f7697 ("block: Remove request.write_hint") and commit c75e707fe1aa ("block: remove the per-bio/request write hint"). This patch does not modify the size of struct bio because the new bi_write_hint member fills a hole in struct bio. 
pahole reports the following for struct bio on an x86_64 system with this patch applied: /* size: 112, cachelines: 2, members: 20 */ /* sum members: 110, holes: 1, sum holes: 2 */ /* last cacheline: 48 bytes */ Reviewed-by: Kanchan Joshi Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240202203926.2478590-7-bvanassche@acm.org Signed-off-by: Christian Brauner --- include/linux/blk-mq.h | 2 ++ include/linux/blk_types.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 7a8150a5f051..492b0128b5d9 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -8,6 +8,7 @@ #include #include #include +#include struct blk_mq_tags; struct blk_flush_queue; @@ -135,6 +136,7 @@ struct request { struct blk_crypto_keyslot *crypt_keyslot; #endif + enum rw_hint write_hint; unsigned short ioprio; enum mq_rq_state state; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index f288c94374b3..12d87cef2c03 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -10,6 +10,7 @@ #include #include #include +#include struct bio_set; struct bio; @@ -269,6 +270,7 @@ struct bio { */ unsigned short bi_flags; /* BIO_* below */ unsigned short bi_ioprio; + enum rw_hint bi_write_hint; blk_status_t bi_status; atomic_t __bi_remaining; -- cgit v1.2.3 From 3ee07964d407411fd578a3bc998de44fd64d266a Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Thu, 1 Feb 2024 11:55:56 +0100 Subject: serial: core: introduce uart_port_tx_flags() And an enum with a flag: UART_TX_NOSTOP. To NOT call __port->ops->stop_tx() when the circular buffer is empty. mxs-uart needs this (see the next patch). 
Signed-off-by: "Jiri Slaby (SUSE)" Cc: stable Tested-by: Emil Kronborg Link: https://lore.kernel.org/r/20240201105557.28043-1-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 536b2581d3e2..55b1f3ba48ac 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -748,8 +748,17 @@ struct uart_driver { void uart_write_wakeup(struct uart_port *port); -#define __uart_port_tx(uport, ch, tx_ready, put_char, tx_done, for_test, \ - for_post) \ +/** + * enum UART_TX_FLAGS -- flags for uart_port_tx_flags() + * + * @UART_TX_NOSTOP: don't call port->ops->stop_tx() on empty buffer + */ +enum UART_TX_FLAGS { + UART_TX_NOSTOP = BIT(0), +}; + +#define __uart_port_tx(uport, ch, flags, tx_ready, put_char, tx_done, \ + for_test, for_post) \ ({ \ struct uart_port *__port = (uport); \ struct circ_buf *xmit = &__port->state->xmit; \ @@ -777,7 +786,7 @@ void uart_write_wakeup(struct uart_port *port); if (pending < WAKEUP_CHARS) { \ uart_write_wakeup(__port); \ \ - if (pending == 0) \ + if (!((flags) & UART_TX_NOSTOP) && pending == 0) \ __port->ops->stop_tx(__port); \ } \ \ @@ -812,7 +821,7 @@ void uart_write_wakeup(struct uart_port *port); */ #define uart_port_tx_limited(port, ch, count, tx_ready, put_char, tx_done) ({ \ unsigned int __count = (count); \ - __uart_port_tx(port, ch, tx_ready, put_char, tx_done, __count, \ + __uart_port_tx(port, ch, 0, tx_ready, put_char, tx_done, __count, \ __count--); \ }) @@ -826,8 +835,21 @@ void uart_write_wakeup(struct uart_port *port); * See uart_port_tx_limited() for more details. 
*/ #define uart_port_tx(port, ch, tx_ready, put_char) \ - __uart_port_tx(port, ch, tx_ready, put_char, ({}), true, ({})) + __uart_port_tx(port, ch, 0, tx_ready, put_char, ({}), true, ({})) + +/** + * uart_port_tx_flags -- transmit helper for uart_port with flags + * @port: uart port + * @ch: variable to store a character to be written to the HW + * @flags: %UART_TX_NOSTOP or similar + * @tx_ready: can HW accept more data function + * @put_char: function to write a character + * + * See uart_port_tx_limited() for more details. + */ +#define uart_port_tx_flags(port, ch, flags, tx_ready, put_char) \ + __uart_port_tx(port, ch, flags, tx_ready, put_char, ({}), true, ({})) /* * Baud rate helpers. */ -- cgit v1.2.3 From 3058fca1ed7955c904584a6d86108d664a927177 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 2 Feb 2024 13:01:31 +0200 Subject: fs: make file_dentry() a simple accessor file_dentry() is a relic from the days that overlayfs was using files with a "fake" path, meaning, f_path on overlayfs and f_inode on underlying fs. In those days, file_dentry() was needed to get the underlying fs dentry that matches f_inode. Files with "fake" path should not exist nowadays, so make file_dentry() a simple accessor and use an assertion to make sure that file_dentry() was not papering over filesystem bugs. 
Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20240202110132.1584111-2-amir73il@gmail.com Tested-by: Stefan Berger Signed-off-by: Christian Brauner --- include/linux/fs.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9efd6220b7c6..2e07cbbf92e3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1084,9 +1084,20 @@ static inline struct inode *file_inode(const struct file *f) return f->f_inode; } +/* + * file_dentry() is a relic from the days that overlayfs was using files with a + * "fake" path, meaning, f_path on overlayfs and f_inode on underlying fs. + * In those days, file_dentry() was needed to get the underlying fs dentry that + * matches f_inode. + * Files with "fake" path should not exist nowadays, so use an assertion to make + * sure that file_dentry() was not papering over filesystem bugs. + */ static inline struct dentry *file_dentry(const struct file *file) { - return d_real(file->f_path.dentry, file_inode(file)); + struct dentry *dentry = file->f_path.dentry; + + WARN_ON_ONCE(d_inode(dentry) != file_inode(file)); + return dentry; } struct fasync_struct { -- cgit v1.2.3 From 11b3f8ae7081607a783d60e8098d46b787f79cad Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 2 Feb 2024 13:01:32 +0200 Subject: fs: remove the inode argument to ->d_real() method The only remaining user of ->d_real() method is d_real_inode(), which passed NULL inode argument to get the real data dentry. There are no longer any users that call ->d_real() with a non-NULL inode argument for getting a dentry from a specific underlying layer. Remove the inode argument of the method and replace it with an integer 'type' argument, to allow callers to request the real metadata dentry instead of the real data dentry. All the current users of d_real_inode() (e.g. uprobe) continue to get the real data inode.
Callers that need to get the real metadata inode (e.g. IMA/EVM) can use d_inode(d_real(dentry, D_REAL_METADATA)). Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20240202110132.1584111-3-amir73il@gmail.com Tested-by: Stefan Berger Signed-off-by: Al Viro Signed-off-by: Christian Brauner --- include/linux/dcache.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 1666c387861f..d616a745a34c 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -125,6 +125,11 @@ enum dentry_d_lock_class DENTRY_D_LOCK_NESTED }; +enum d_real_type { + D_REAL_DATA, + D_REAL_METADATA, +}; + struct dentry_operations { int (*d_revalidate)(struct dentry *, unsigned int); int (*d_weak_revalidate)(struct dentry *, unsigned int); @@ -139,7 +144,7 @@ struct dentry_operations { char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); - struct dentry *(*d_real)(struct dentry *, const struct inode *); + struct dentry *(*d_real)(struct dentry *, enum d_real_type type); } ____cacheline_aligned; /* @@ -546,24 +551,23 @@ static inline struct inode *d_backing_inode(const struct dentry *upper) /** * d_real - Return the real dentry * @dentry: the dentry to query - * @inode: inode to select the dentry from multiple layers (can be NULL) + * @type: the type of real dentry (data or metadata) * * If dentry is on a union/overlay, then return the underlying, real dentry. * Otherwise return the dentry itself.
* * See also: Documentation/filesystems/vfs.rst */ -static inline struct dentry *d_real(struct dentry *dentry, - const struct inode *inode) +static inline struct dentry *d_real(struct dentry *dentry, enum d_real_type type) { if (unlikely(dentry->d_flags & DCACHE_OP_REAL)) - return dentry->d_op->d_real(dentry, inode); + return dentry->d_op->d_real(dentry, type); else return dentry; } /** - * d_real_inode - Return the real inode + * d_real_inode - Return the real inode hosting the data * @dentry: The dentry to query * * If dentry is on a union/overlay, then return the underlying, real inode. @@ -572,7 +576,7 @@ static inline struct dentry *d_real(struct dentry *dentry, static inline struct inode *d_real_inode(const struct dentry *dentry) { /* This usage of d_real() results in const dentry */ - return d_backing_inode(d_real((struct dentry *) dentry, NULL)); + return d_inode(d_real((struct dentry *) dentry, D_REAL_DATA)); } struct name_snapshot { -- cgit v1.2.3 From 853b8d7597eea4ccaaefbcf0942cd42fc86d542a Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 2 Feb 2024 12:22:58 +0200 Subject: remap_range: merge do_clone_file_range() into vfs_clone_file_range() commit dfad37051ade ("remap_range: move permission hooks out of do_clone_file_range()") moved the permission hooks from do_clone_file_range() out to its caller vfs_clone_file_range(), but left all the fast sanity checks in do_clone_file_range(). This makes the expensive security hooks be called in situations that they would not have been called before (e.g. fs does not support clone). The only reason for the do_clone_file_range() helper was that overlayfs did not use to be able to call vfs_clone_file_range() from copy up context with sb_writers lock held. However, since commit c63e56a4a652 ("ovl: do not open/llseek lower file with upper sb_writers held"), overlayfs just uses an open coded version of vfs_clone_file_range(). 
Merge do_clone_file_range() into vfs_clone_file_range(), restoring the original order of checks as it was before the regressing commit and adapt the overlayfs code to call vfs_clone_file_range() before the permission hooks that were added by commit ca7ab482401c ("ovl: add permission hooks outside of do_splice_direct()"). Note that in the merge of do_clone_file_range(), the file_start_write() context was reduced to cover ->remap_file_range() without holding it over the permission hooks, which was the reason for doing the regressing commit in the first place. Reported-and-tested-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202401312229.eddeb9a6-oliver.sang@intel.com Fixes: dfad37051ade ("remap_range: move permission hooks out of do_clone_file_range()") Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20240202102258.1582671-1-amir73il@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a70495..023f37c60709 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2101,9 +2101,6 @@ int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *count, unsigned int remap_flags); -extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t len, unsigned int remap_flags); extern loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); -- cgit v1.2.3 From f814bdda774c183b0cc15ec8f3b6e7c6f4527ba5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 23 Jan 2024 18:58:26 +0100 Subject: blk-wbt: Fix detection of dirty-throttled tasks The detection of dirty-throttled tasks in blk-wbt has been subtly broken since its
beginning in 2016. Namely if we are doing cgroup writeback and the throttled task is not in the root cgroup, balance_dirty_pages() will set dirty_sleep for the non-root bdi_writeback structure. However blk-wbt checks dirty_sleep only in the root cgroup bdi_writeback structure. Thus detection of recently throttled tasks is not working in this case (we noticed this when we switched to cgroup v2 and suddenly writeback was slow). Since blk-wbt has no easy way to get to proper bdi_writeback and furthermore its intention has always been to work on the whole device rather than on individual cgroups, just move the dirty_sleep timestamp from bdi_writeback to backing_dev_info. That fixes the checking for recently throttled task and saves memory for everybody as a bonus. CC: stable@vger.kernel.org Fixes: b57d74aff9ab ("writeback: track if we're sleeping on progress in balance_dirty_pages()") Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20240123175826.21452-1-jack@suse.cz [axboe: fixup indentation errors] Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index ae12696ec492..2ad261082bba 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -141,8 +141,6 @@ struct bdi_writeback { struct delayed_work dwork; /* work item used for writeback */ struct delayed_work bw_dwork; /* work item used for bandwidth estimate */ - unsigned long dirty_sleep; /* last wait */ - struct list_head bdi_node; /* anchored at bdi->wb_list */ #ifdef CONFIG_CGROUP_WRITEBACK @@ -179,6 +177,11 @@ struct backing_dev_info { * any dirty wbs, which is depended upon by bdi_has_dirty(). */ atomic_long_t tot_write_bandwidth; + /* + * Jiffies when last process was dirty throttled on this bdi. Used by + * blk-wbt.
+ */ + unsigned long last_bdp_sleep; struct bdi_writeback wb; /* the root writeback info for this bdi */ struct list_head wb_list; /* list of all wbs */ -- cgit v1.2.3 From 8284765f03b7a0b18968cefeb5e78aca647b8f8b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 9 Jan 2024 17:15:32 -0800 Subject: KVM: Get reference to VM's address space in the async #PF worker Get a reference to the target VM's address space in async_pf_execute() instead of gifting a reference from kvm_setup_async_pf(). Keeping the address space alive just to service an async #PF is counter-productive, i.e. if the process is exiting and all vCPUs are dead, then NOT doing get_user_pages_remote() and freeing the address space asap is desirable. Handling the mm reference entirely within async_pf_execute() also simplifies the async #PF flows as a whole, e.g. it's not immediately obvious when the worker task vs. the vCPU task is responsible for putting the gifted mm reference. Reviewed-by: Vitaly Kuznetsov Reviewed-by: Xu Yilun Link: https://lore.kernel.org/r/20240110011533.503302-4-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7e7fd25b09b3..bbfefd7e612f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -238,7 +238,6 @@ struct kvm_async_pf { struct list_head link; struct list_head queue; struct kvm_vcpu *vcpu; - struct mm_struct *mm; gpa_t cr2_or_gpa; unsigned long addr; struct kvm_arch_async_pf arch; -- cgit v1.2.3 From 574849054d97cee5be78d6c149d84685647fe774 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 5 Feb 2024 05:37:28 +0000 Subject: of: property: use unsigned int return on of_graph_get_endpoint_count() Because of_graph_get_endpoint_count() doesn't report errors and just returns the endpoint count, its return type should be unsigned. Tidy it up.
Signed-off-by: Kuninori Morimoto Reviewed-by: Laurent Pinchart Link: https://lore.kernel.org/r/87plxbcvzb.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Rob Herring --- include/linux/of_graph.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_graph.h b/include/linux/of_graph.h index 4d7756087b6b..a4bea62bfa29 100644 --- a/include/linux/of_graph.h +++ b/include/linux/of_graph.h @@ -41,7 +41,7 @@ struct of_endpoint { bool of_graph_is_present(const struct device_node *node); int of_graph_parse_endpoint(const struct device_node *node, struct of_endpoint *endpoint); -int of_graph_get_endpoint_count(const struct device_node *np); +unsigned int of_graph_get_endpoint_count(const struct device_node *np); struct device_node *of_graph_get_port_by_id(struct device_node *node, u32 id); struct device_node *of_graph_get_next_endpoint(const struct device_node *parent, struct device_node *previous); @@ -68,7 +68,7 @@ static inline int of_graph_parse_endpoint(const struct device_node *node, return -ENOSYS; } -static inline int of_graph_get_endpoint_count(const struct device_node *np) +static inline unsigned int of_graph_get_endpoint_count(const struct device_node *np) { return 0; } -- cgit v1.2.3 From b64691274f5d33fc9d93af73483162967f7ec5bb Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 3 Feb 2024 20:53:15 +0100 Subject: net: phy: add helper phy_advertise_eee_all Per default phylib preserves the EEE advertising at the time of phy probing. The EEE advertising can be changed from user space, in addition this helper allows to set the EEE advertising to all supported modes from drivers in kernel space. 
Suggested-by: Andrew Lunn Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20bfc471-aeeb-4ae4-ba09-7d6d4be6b86a@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index ad93f8b1b128..fd8dbea9b4d9 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1960,6 +1960,7 @@ int phy_get_rate_matching(struct phy_device *phydev, void phy_set_max_speed(struct phy_device *phydev, u32 max_speed); void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode); void phy_advertise_supported(struct phy_device *phydev); +void phy_advertise_eee_all(struct phy_device *phydev); void phy_support_sym_pause(struct phy_device *phydev); void phy_support_asym_pause(struct phy_device *phydev); void phy_set_sym_pause(struct phy_device *phydev, bool rx, bool tx, -- cgit v1.2.3 From c0c0293cf7a0f21ef461956d44e4add718574f3f Mon Sep 17 00:00:00 2001 From: David Lechner Date: Tue, 6 Feb 2024 14:06:46 -0600 Subject: spi: drop gfp arg from __spi_split_transfer_maxsize() The __spi_split_transfer_maxsize() function has a gfp argument to allow callers to specify the type of memory allocation that needs to be used. However, this function only allocates struct spi_transfer and is not intended to be used from atomic contexts so this type should always be GFP_KERNEL, so we can just drop the argument. Some callers of these functions also passed GFP_DMA, but since only struct spi_transfer is allocated and not any tx/rx buffers, this is not actually necessary and is removed in this commit.
Signed-off-by: David Lechner Link: https://lore.kernel.org/r/20240206200648.1782234-1-dlechner@baylibre.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 29c3e4dd5d93..9339c8ed1f8f 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -1365,12 +1365,10 @@ struct spi_replaced_transfers { extern int spi_split_transfers_maxsize(struct spi_controller *ctlr, struct spi_message *msg, - size_t maxsize, - gfp_t gfp); + size_t maxsize); extern int spi_split_transfers_maxwords(struct spi_controller *ctlr, struct spi_message *msg, - size_t maxwords, - gfp_t gfp); + size_t maxwords); /*---------------------------------------------------------------------------*/ -- cgit v1.2.3 From cd7d469c25704d414d71bf3644f163fb74e7996b Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Fri, 13 Oct 2023 13:55:44 +0800 Subject: libceph: fail sparse-read if the data length doesn't match Once this happens, that means there are bugs.
Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index fa018d5864e7..f66f6aac74f6 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -45,6 +45,7 @@ enum ceph_sparse_read_state { CEPH_SPARSE_READ_HDR = 0, CEPH_SPARSE_READ_EXTENTS, CEPH_SPARSE_READ_DATA_LEN, + CEPH_SPARSE_READ_DATA_PRE, CEPH_SPARSE_READ_DATA, }; @@ -64,7 +65,7 @@ struct ceph_sparse_read { u64 sr_req_len; /* orig request length */ u64 sr_pos; /* current pos in buffer */ int sr_index; /* current extent index */ - __le32 sr_datalen; /* length of actual data */ + u32 sr_datalen; /* length of actual data */ u32 sr_count; /* extent count in reply */ int sr_ext_len; /* length of extent array */ struct ceph_sparse_extent *sr_extent; /* extent array */ -- cgit v1.2.3 From 8e46a2d068c92a905d01cbb018b00d66991585ab Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 14 Dec 2023 16:01:03 +0800 Subject: libceph: just wait for more data to be available on the socket A short read may occur while reading the message footer from the socket. Later, when the socket is ready for another read, the messenger invokes all read_partial_*() handlers, including read_partial_sparse_msg_data(). The expectation is that read_partial_sparse_msg_data() would bail, allowing the messenger to invoke read_partial() for the footer and pick up where it left off. However read_partial_sparse_msg_data() violates that and ends up calling into the state machine in the OSD client. The sparse-read state machine assumes that it's a new op and interprets some piece of the footer as the sparse-read header and returns bogus extents/data length, etc. To determine whether read_partial_sparse_msg_data() should bail, let's reuse cursor->total_resid. 
Because once it reaches to zero that means all the extents and data have been successfully received in last read, else it could break out when partially reading any of the extents and data. And then osd_sparse_read() could continue where it left off. [ idryomov: changelog ] Link: https://tracker.ceph.com/issues/63586 Fixes: d396f89db39a ("libceph: add sparse read support to msgr1") Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 2eaaabbe98cb..1717cc57cdac 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -283,7 +283,7 @@ struct ceph_msg { struct kref kref; bool more_to_follow; bool needs_out_seq; - bool sparse_read; + u64 sparse_read_total; int front_alloc_len; struct ceph_msgpool *pool; -- cgit v1.2.3 From 93630d6df7507fa4e664110b1878c06a0c00b0b9 Mon Sep 17 00:00:00 2001 From: Peter Hilber Date: Thu, 1 Feb 2024 02:04:47 +0100 Subject: timekeeping: Add clocksource ID to struct system_counterval_t Clocksource pointers can be problematic to obtain for drivers which are not clocksource drivers themselves. In particular, the RFC virtio_rtc driver [1] would require a new helper function to obtain a pointer to the ARM Generic Timer clocksource. The ptp_kvm driver also required a similar workaround. Add a clocksource ID member to struct system_counterval_t, which in the future shall identify the clocksource, and which shall replace the struct clocksource * member. By this, get_device_system_crosststamp() callers (such as virtio_rtc and ptp_kvm) will be able to supply easily accessible clocksource ids instead of clocksource pointers. 
[1] https://lore.kernel.org/lkml/20231218073849.35294-1-peter.hilber@opensynergy.com/ Signed-off-by: Peter Hilber Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240201010453.2212371-3-peter.hilber@opensynergy.com --- include/linux/timekeeping.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 7c43e98cf211..ca234fa4cc04 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -273,10 +273,15 @@ struct system_device_crosststamp { * @cycles: System counter value * @cs: Clocksource corresponding to system counter value. Used by * timekeeping code to verify comparibility of two cycle values + * @cs_id: Clocksource ID corresponding to system counter value. To be + * used instead of cs in the future. + * The default ID, CSID_GENERIC, does not identify a specific + * clocksource. */ struct system_counterval_t { u64 cycles; struct clocksource *cs; + enum clocksource_ids cs_id; }; /* -- cgit v1.2.3 From a2c1fe72062a5dd69de4dfe892f6436d6c0479dd Mon Sep 17 00:00:00 2001 From: Peter Hilber Date: Thu, 1 Feb 2024 02:04:48 +0100 Subject: x86/tsc: Add clocksource ID, set system_counterval_t.cs_id Add a clocksource ID for TSC and a distinct one for the early TSC. Use distinct IDs for TSC and early TSC, since those also have distinct clocksource structs. This should help to keep existing semantics when comparing clocksources. Also, set the recently added struct system_counterval_t member cs_id to the TSC ID in the cases where the clocksource member is being set to the TSC clocksource. In the future, get_device_system_crosststamp() will compare the clocksource ID in struct system_counterval_t, rather than the clocksource. For the x86 ART related code, system_counterval_t.cs == NULL corresponds to system_counterval_t.cs_id == CSID_GENERIC (0). 
Signed-off-by: Peter Hilber Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240201010453.2212371-4-peter.hilber@opensynergy.com --- include/linux/clocksource_ids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clocksource_ids.h b/include/linux/clocksource_ids.h index 16775d7d8f8d..f8467946e9ee 100644 --- a/include/linux/clocksource_ids.h +++ b/include/linux/clocksource_ids.h @@ -6,6 +6,8 @@ enum clocksource_ids { CSID_GENERIC = 0, CSID_ARM_ARCH_COUNTER, + CSID_X86_TSC_EARLY, + CSID_X86_TSC, CSID_MAX, }; -- cgit v1.2.3 From 576bd4962f19bb8f437f8cecbb25e4202438c41e Mon Sep 17 00:00:00 2001 From: Peter Hilber Date: Thu, 1 Feb 2024 02:04:49 +0100 Subject: x86/kvm, ptp/kvm: Add clocksource ID, set system_counterval_t.cs_id Add a clocksource ID for the x86 kvmclock. Also, for ptp_kvm, set the recently added struct system_counterval_t member cs_id to the clocksource ID (x86 kvmclock or ARM Generic Timer). In the future, get_device_system_crosststamp() will compare the clocksource ID in struct system_counterval_t, rather than the clocksource. For now, to avoid touching too many subsystems at once, extract the clocksource ID from the clocksource. The clocksource dereference will be removed once everything is converted over.
Signed-off-by: Peter Hilber Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240201010453.2212371-5-peter.hilber@opensynergy.com --- include/linux/clocksource_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/clocksource_ids.h b/include/linux/clocksource_ids.h index f8467946e9ee..a4fa3436940c 100644 --- a/include/linux/clocksource_ids.h +++ b/include/linux/clocksource_ids.h @@ -8,6 +8,7 @@ enum clocksource_ids { CSID_ARM_ARCH_COUNTER, CSID_X86_TSC_EARLY, CSID_X86_TSC, + CSID_X86_KVM_CLK, CSID_MAX, }; -- cgit v1.2.3 From 9be3b2f057d7a6752e8cf25c1d456198b4d3bd6a Mon Sep 17 00:00:00 2001 From: Peter Hilber Date: Thu, 1 Feb 2024 02:04:50 +0100 Subject: ptp/kvm, arm_arch_timer: Set system_counterval_t.cs_id to constant Identify the clocksources used by ptp_kvm by setting the clocksource ID enum constants. This avoids dereferencing struct clocksource. Once the system_counterval_t.cs member will be removed, this will also avoid the need to obtain clocksource pointers from kvm_arch_ptp_get_crosststamp(). The clocksource IDs are associated to timestamps requested from the KVM hypervisor, so the proper clocksource ID is known at the ptp_kvm request site. While at it, also make the ptp_kvm_get_time_fn() 'ret' variable type int as that's what the function return value is. 
Signed-off-by: Peter Hilber Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240201010453.2212371-6-peter.hilber@opensynergy.com --- include/linux/ptp_kvm.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ptp_kvm.h b/include/linux/ptp_kvm.h index 746fd67c3480..95b3d4d0d7dd 100644 --- a/include/linux/ptp_kvm.h +++ b/include/linux/ptp_kvm.h @@ -8,6 +8,7 @@ #ifndef _PTP_KVM_H_ #define _PTP_KVM_H_ +#include #include struct timespec64; @@ -17,6 +18,7 @@ int kvm_arch_ptp_init(void); void kvm_arch_ptp_exit(void); int kvm_arch_ptp_get_clock(struct timespec64 *ts); int kvm_arch_ptp_get_crosststamp(u64 *cycle, - struct timespec64 *tspec, struct clocksource **cs); + struct timespec64 *tspec, struct clocksource **cs, + enum clocksource_ids *cs_id); #endif /* _PTP_KVM_H_ */ -- cgit v1.2.3 From 4b7f521229ef4eee06848427d865954e6e0e3675 Mon Sep 17 00:00:00 2001 From: Peter Hilber Date: Thu, 1 Feb 2024 02:04:51 +0100 Subject: timekeeping: Evaluate system_counterval_t.cs_id instead of .cs Clocksource pointers can be problematic to obtain for drivers which are not clocksource drivers themselves. In particular, the RFC virtio_rtc driver [1] would require a new helper function to obtain a pointer to the ARM Generic Timer clocksource. The ptp_kvm driver also required a similar workaround. Address this by evaluating the clocksource ID, rather than the clocksource pointer, of struct system_counterval_t. By this, setting the clocksource pointer becomes unneeded, and get_device_system_crosststamp() callers will no longer need to supply clocksource pointers. All relevant clocksource drivers provide the ID, so this change is not changing the behaviour. 
[1] https://lore.kernel.org/lkml/20231218073849.35294-1-peter.hilber@opensynergy.com/ Signed-off-by: Peter Hilber Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240201010453.2212371-7-peter.hilber@opensynergy.com --- include/linux/timekeeping.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index ca234fa4cc04..3538c5bdf9ee 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -268,13 +268,13 @@ struct system_device_crosststamp { }; /** - * struct system_counterval_t - system counter value with the pointer to the + * struct system_counterval_t - system counter value with the ID of the * corresponding clocksource * @cycles: System counter value - * @cs: Clocksource corresponding to system counter value. Used by - * timekeeping code to verify comparibility of two cycle values - * @cs_id: Clocksource ID corresponding to system counter value. To be - * used instead of cs in the future. + * @cs: Clocksource corresponding to system counter value. Timekeeping + * code now evaluates cs_id instead. + * @cs_id: Clocksource ID corresponding to system counter value. Used by + * timekeeping code to verify comparability of two cycle values. * The default ID, CSID_GENERIC, does not identify a specific * clocksource. */ -- cgit v1.2.3 From b152688c91313ab4073cff4a5e63ff4cc491c358 Mon Sep 17 00:00:00 2001 From: Peter Hilber Date: Thu, 1 Feb 2024 02:04:52 +0100 Subject: treewide: Remove system_counterval_t.cs, which is never read The clocksource pointer in struct system_counterval_t is not evaluated any more. Remove the code setting the member, and the member itself. 
Signed-off-by: Peter Hilber Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240201010453.2212371-8-peter.hilber@opensynergy.com --- include/linux/ptp_kvm.h | 4 +--- include/linux/timekeeping.h | 3 --- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptp_kvm.h b/include/linux/ptp_kvm.h index 95b3d4d0d7dd..e8c74fa3f455 100644 --- a/include/linux/ptp_kvm.h +++ b/include/linux/ptp_kvm.h @@ -12,13 +12,11 @@ #include struct timespec64; -struct clocksource; int kvm_arch_ptp_init(void); void kvm_arch_ptp_exit(void); int kvm_arch_ptp_get_clock(struct timespec64 *ts); int kvm_arch_ptp_get_crosststamp(u64 *cycle, - struct timespec64 *tspec, struct clocksource **cs, - enum clocksource_ids *cs_id); + struct timespec64 *tspec, enum clocksource_ids *cs_id); #endif /* _PTP_KVM_H_ */ diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 3538c5bdf9ee..7e50cbd97f86 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -271,8 +271,6 @@ struct system_device_crosststamp { * struct system_counterval_t - system counter value with the ID of the * corresponding clocksource * @cycles: System counter value - * @cs: Clocksource corresponding to system counter value. Timekeeping - * code now evaluates cs_id instead. * @cs_id: Clocksource ID corresponding to system counter value. Used by * timekeeping code to verify comparability of two cycle values. * The default ID, CSID_GENERIC, does not identify a specific @@ -280,7 +278,6 @@ struct system_device_crosststamp { */ struct system_counterval_t { u64 cycles; - struct clocksource *cs; enum clocksource_ids cs_id; }; -- cgit v1.2.3 From 7412dc6d55eed6b76180e40ac3601412ebde29bd Mon Sep 17 00:00:00 2001 From: John Ogness Date: Wed, 7 Feb 2024 14:47:03 +0106 Subject: dump_stack: Do not get cpu_sync for panic CPU dump_stack() is called in panic(). 
If for some reason another CPU is holding the printk_cpu_sync and is unable to release it, the panic CPU will be unable to continue and print the stacktrace. Since non-panic CPUs are not allowed to store new printk messages anyway, there is no need to synchronize the stacktrace output in a panic situation. For the panic CPU, do not get the printk_cpu_sync because it is not needed and avoids a potential deadlock scenario in panic(). Link: https://lore.kernel.org/lkml/ZcIGKU8sxti38Kok@alley Signed-off-by: John Ogness Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20240207134103.1357162-15-john.ogness@linutronix.de Signed-off-by: Petr Mladek --- include/linux/printk.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 8ef499ab3c1e..955e31860095 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -273,6 +273,8 @@ static inline void printk_trigger_flush(void) } #endif +bool this_cpu_in_panic(void); + #ifdef CONFIG_SMP extern int __printk_cpu_sync_try_get(void); extern void __printk_cpu_sync_wait(void); -- cgit v1.2.3 From d160c66cda0ac8614adc53a5b5b0e6d6f1a05a5b Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Mon, 5 Feb 2024 12:30:22 +0200 Subject: net: Do not return value from init_dummy_netdev() init_dummy_netdev() always returns zero and all the callers do not check the returned value. Set the function to not return value, as it is not really used today. 
Signed-off-by: Amit Cohen Reviewed-by: Ido Schimmel Reviewed-by: Jiri Pirko Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240205103022.440946-1-amcohen@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 118c40258d07..1845dd5043b4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3198,7 +3198,7 @@ static inline void unregister_netdevice(struct net_device *dev) int netdev_refcnt_read(const struct net_device *dev); void free_netdev(struct net_device *dev); void netdev_freemem(struct net_device *dev); -int init_dummy_netdev(struct net_device *dev); +void init_dummy_netdev(struct net_device *dev); struct net_device *netdev_get_xmit_slave(struct net_device *dev, struct sk_buff *skb, -- cgit v1.2.3 From ccb49011bb2ebfd66164dbf68c5bff48917bb5ef Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 6 Feb 2024 15:08:19 +0100 Subject: quota: Properly annotate i_dquot arrays with __rcu Dquots pointed to from i_dquot arrays in inodes are protected by dquot_srcu. Annotate them as such and change .get_dquots callback to return properly annotated pointer to make sparse happy. 
Fixes: b9ba6f94b238 ("quota: remove dqptr_sem") Signed-off-by: Jan Kara --- include/linux/fs.h | 2 +- include/linux/shmem_fs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a70495..d0b849e4f6cd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2159,7 +2159,7 @@ struct super_operations { #ifdef CONFIG_QUOTA ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); - struct dquot **(*get_dquots)(struct inode *); + struct dquot __rcu **(*get_dquots)(struct inode *); #endif long (*nr_cached_objects)(struct super_block *, struct shrink_control *); diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 2caa6b86106a..66828dfc6e74 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -37,7 +37,7 @@ struct shmem_inode_info { unsigned int fsflags; /* for FS_IOC_[SG]ETFLAGS */ atomic_t stop_eviction; /* hold when working on inode */ #ifdef CONFIG_TMPFS_QUOTA - struct dquot *i_dquot[MAXQUOTAS]; + struct dquot __rcu *i_dquot[MAXQUOTAS]; #endif struct inode vfs_inode; }; -- cgit v1.2.3 From 2259233110d90059187c5ba75537eb93eba8417b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 7 Feb 2024 19:40:30 +0100 Subject: spi: bitbang: Follow renaming of SPI "master" to "controller" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit 8caab75fd2c2 ("spi: Generalize SPI "master" to "controller"") some functions and struct members were renamed. To not break all drivers compatibility macros were provided. To be able to remove these compatibility macros push the renaming into the SPI bitbang controller drivers. 
Acked-by: Jonathan Cameron Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/f7f949feb803acb8bea75798f41371a13287f4e8.1707324794.git.u.kleine-koenig@pengutronix.de Signed-off-by: Mark Brown --- include/linux/spi/spi_bitbang.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi_bitbang.h b/include/linux/spi/spi_bitbang.h index 4444c2a992cb..b930eca2ef7b 100644 --- a/include/linux/spi/spi_bitbang.h +++ b/include/linux/spi/spi_bitbang.h @@ -10,7 +10,7 @@ struct spi_bitbang { u8 use_dma; u16 flags; /* extra spi->mode support */ - struct spi_master *master; + struct spi_controller *ctlr; /* setup_transfer() changes clock and/or wordsize to match settings * for this transfer; zeroes restore defaults from spi_device. -- cgit v1.2.3 From 620d269f29a569ba37419cc03cf1da2d55f6252a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 7 Feb 2024 19:40:45 +0100 Subject: spi: Drop compat layer from renaming "master" to "controller" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that all in-tree users followed the rename, the compat stuff can go away. 
This completes the renaming started with commit 8caab75fd2c2 ("spi: Generalize SPI "master" to "controller"") Acked-by: Jonathan Cameron Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/ad1d949325b61a4682e8d6ecf9d05da751e6a99f.1707324794.git.u.kleine-koenig@pengutronix.de Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 600fbd5daf68..30ada46b51cd 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -131,7 +131,6 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, * struct spi_device - Controller side proxy for an SPI slave device * @dev: Driver model representation of the device. * @controller: SPI controller used with the device. - * @master: Copy of controller, for backwards compatibility. * @max_speed_hz: Maximum clock rate to be used with this chip * (on this board); may be changed by the device's driver. * The spi_transfer.speed_hz can override this for each transfer. 
@@ -185,7 +184,6 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, struct spi_device { struct device dev; struct spi_controller *controller; - struct spi_controller *master; /* Compatibility layer */ u32 max_speed_hz; u8 chip_select[SPI_CS_CNT_MAX]; u8 bits_per_word; @@ -1298,7 +1296,7 @@ spi_max_transfer_size(struct spi_device *spi) */ static inline bool spi_is_bpw_supported(struct spi_device *spi, u32 bpw) { - u32 bpw_mask = spi->master->bits_per_word_mask; + u32 bpw_mask = spi->controller->bits_per_word_mask; if (bpw == 8 || (bpw <= 32 && bpw_mask & SPI_BPW_MASK(bpw))) return true; @@ -1670,20 +1668,4 @@ spi_transfer_is_last(struct spi_controller *ctlr, struct spi_transfer *xfer) return list_is_last(&xfer->transfer_list, &ctlr->cur_msg->transfers); } -/* Compatibility layer */ -#define spi_master spi_controller - -#define spi_master_get_devdata(_ctlr) spi_controller_get_devdata(_ctlr) -#define spi_master_set_devdata(_ctlr, _data) \ - spi_controller_set_devdata(_ctlr, _data) -#define spi_master_get(_ctlr) spi_controller_get(_ctlr) -#define spi_master_put(_ctlr) spi_controller_put(_ctlr) -#define spi_master_suspend(_ctlr) spi_controller_suspend(_ctlr) -#define spi_master_resume(_ctlr) spi_controller_resume(_ctlr) - -#define spi_register_master(_ctlr) spi_register_controller(_ctlr) -#define devm_spi_register_master(_dev, _ctlr) \ - devm_spi_register_controller(_dev, _ctlr) -#define spi_unregister_master(_ctlr) spi_unregister_controller(_ctlr) - #endif /* __LINUX_SPI_H */ -- cgit v1.2.3 From c478db84c8544156b80c5e5d3a8c7840d557707a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 29 Jan 2024 19:34:45 +0100 Subject: wifi: mac80211: refactor puncturing bitmap extraction Add a new inline helper function to ieee80211.h to extract the disabled subchannels bitmap from an EHT operation element, and use that in mac80211 where we do that. 
Link: https://msgid.link/20240129194108.d9f50dcec8d0.I8b08cbc2490a734fafcce0fa0fc328211ba6f10b@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index a70388ae3a7b..d9d2c1253157 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -3189,6 +3189,22 @@ ieee80211_eht_oper_size_ok(const u8 *data, u8 len) return len >= needed; } +/* must validate ieee80211_eht_oper_size_ok() first */ +static inline u16 +ieee80211_eht_oper_dis_subchan_bitmap(const struct ieee80211_eht_operation *eht_oper) +{ + const struct ieee80211_eht_operation_info *info = + (const void *)eht_oper->optional; + + if (!(eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT)) + return 0; + + if (!(eht_oper->params & IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT)) + return 0; + + return get_unaligned_le16(info->optional); +} + #define IEEE80211_BW_IND_DIS_SUBCH_PRESENT BIT(1) struct ieee80211_bandwidth_indication { -- cgit v1.2.3 From 3c7a8e190bc580813ddd9259f62971c8d2a6b5ad Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 12 Dec 2023 11:15:28 -0500 Subject: uapi: introduce uapi-friendly macros for GENMASK Move __GENMASK and __GENMASK_ULL from include/ to include/uapi/ so that they can be used to define masks in userspace API headers. Compared to what is already in include/linux/bits.h, the definitions need to use the uglified versions of UL(), ULL(), BITS_PER_LONG and BITS_PER_LONG_LONG (which did not even exist), but otherwise expand to the same content. 
Signed-off-by: Paolo Bonzini --- include/linux/bits.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bits.h b/include/linux/bits.h index 7c0cf5031abe..0eb24d21aac2 100644 --- a/include/linux/bits.h +++ b/include/linux/bits.h @@ -4,6 +4,7 @@ #include #include +#include #include #define BIT_MASK(nr) (UL(1) << ((nr) % BITS_PER_LONG)) @@ -30,15 +31,8 @@ #define GENMASK_INPUT_CHECK(h, l) 0 #endif -#define __GENMASK(h, l) \ - (((~UL(0)) - (UL(1) << (l)) + 1) & \ - (~UL(0) >> (BITS_PER_LONG - 1 - (h)))) #define GENMASK(h, l) \ (GENMASK_INPUT_CHECK(h, l) + __GENMASK(h, l)) - -#define __GENMASK_ULL(h, l) \ - (((~ULL(0)) - (ULL(1) << (l)) + 1) & \ - (~ULL(0) >> (BITS_PER_LONG_LONG - 1 - (h)))) #define GENMASK_ULL(h, l) \ (GENMASK_INPUT_CHECK(h, l) + __GENMASK_ULL(h, l)) -- cgit v1.2.3 From a3c78778f50c4db6cc0bb6aa2986c0174b1267d0 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 8 Feb 2024 11:55:38 +0000 Subject: PM: EM: Refactor em_pd_get_efficient_state() to be more flexible The Energy Model (EM) is going to support runtime modification. There are going to be 2 EM tables which store information. This patch aims to prepare the code to be generic and use one of the tables. The function will no longer get a pointer to 'struct em_perf_domain' (the EM) but instead a pointer to 'struct em_perf_state' (which is one of the EM's tables). Prepare em_pd_get_efficient_state() for the upcoming changes and make it possible to be re-used. Return an index for the best performance state for a given EM table. The function arguments that are introduced should allow to work on different performance state arrays. The caller of em_pd_get_efficient_state() should be able to use the index either on the default or the modifiable EM table. Reviewed-by: Daniel Lezcano Reviewed-by: Hongyan Xia Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. 
Wysocki --- include/linux/energy_model.h | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 88d91e087471..1dcd1645dde7 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -175,33 +175,35 @@ void em_dev_unregister_perf_domain(struct device *dev); /** * em_pd_get_efficient_state() - Get an efficient performance state from the EM - * @pd : Performance domain for which we want an efficient frequency - * @freq : Frequency to map with the EM + * @table: List of performance states, in ascending order + * @nr_perf_states: Number of performance states + * @freq: Frequency to map with the EM + * @pd_flags: Performance Domain flags * * It is called from the scheduler code quite frequently and as a consequence * doesn't implement any check. * - * Return: An efficient performance state, high enough to meet @freq + * Return: An efficient performance state id, high enough to meet @freq * requirement. 
 */ -static inline -struct em_perf_state *em_pd_get_efficient_state(struct em_perf_domain *pd, - unsigned long freq) +static inline int +em_pd_get_efficient_state(struct em_perf_state *table, int nr_perf_states, + unsigned long freq, unsigned long pd_flags) { struct em_perf_state *ps; int i; - for (i = 0; i < pd->nr_perf_states; i++) { - ps = &pd->table[i]; + for (i = 0; i < nr_perf_states; i++) { + ps = &table[i]; if (ps->frequency >= freq) { - if (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES && + if (pd_flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES && ps->flags & EM_PERF_STATE_INEFFICIENT) continue; - break; + return i; } } - return ps; + return nr_perf_states - 1; } /** @@ -226,7 +228,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, { unsigned long freq, ref_freq, scale_cpu; struct em_perf_state *ps; - int cpu; + int cpu, i; if (!sum_util) return 0; @@ -250,7 +252,9 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, * Find the lowest performance state of the Energy Model above the * requested frequency. */ - ps = em_pd_get_efficient_state(pd, freq); + i = em_pd_get_efficient_state(pd->table, pd->nr_perf_states, freq, + pd->flags); + ps = &pd->table[i]; /* * The capacity of a CPU in the domain at the performance state (ps) -- cgit v1.2.3 From ca0fc871f16f4bef746b5ba814b67afb59119700 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 8 Feb 2024 11:55:42 +0000 Subject: PM: EM: Introduce runtime modifiable table The new runtime table can be populated with new power data to better reflect the actual efficiency of the device e.g. CPU. The power can vary over time e.g. due to the SoC temperature change. Higher temperature can increase power values. For longer running scenarios, such as game or camera, when also other devices are used (e.g. GPU, ISP) the CPU power can change. The new EM framework is able to address this issue and change the EM data at runtime safely.
Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 1dcd1645dde7..8ddf1d8a9581 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -36,9 +36,20 @@ struct em_perf_state { */ #define EM_PERF_STATE_INEFFICIENT BIT(0) +/** + * struct em_perf_table - Performance states table + * @rcu: RCU used for safe access and destruction + * @state: List of performance states, in ascending order + */ +struct em_perf_table { + struct rcu_head rcu; + struct em_perf_state state[]; +}; + /** * struct em_perf_domain - Performance domain * @table: List of performance states, in ascending order + * @em_table: Pointer to the runtime modifiable em_perf_table * @nr_perf_states: Number of performance states * @flags: See "em_perf_domain flags" * @cpus: Cpumask covering the CPUs of the domain. It's here @@ -54,6 +65,7 @@ struct em_perf_state { */ struct em_perf_domain { struct em_perf_state *table; + struct em_perf_table __rcu *em_table; int nr_perf_states; unsigned long flags; unsigned long cpus[]; -- cgit v1.2.3 From aa11a7ebfd5d698f541641922beede1cb474bf70 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 8 Feb 2024 11:55:43 +0000 Subject: PM: EM: Use runtime modified EM for CPUs energy estimation in EAS The new Energy Model (EM) supports runtime modification of the performance state table to better model the power used by the SoC. Use this new feature to improve energy estimation and therefore task placement in Energy Aware Scheduler (EAS). Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. 
Wysocki --- include/linux/energy_model.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 8ddf1d8a9581..5f842da3bb0c 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -239,9 +239,14 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, unsigned long allowed_cpu_cap) { unsigned long freq, ref_freq, scale_cpu; + struct em_perf_table *em_table; struct em_perf_state *ps; int cpu, i; +#ifdef CONFIG_SCHED_DEBUG + WARN_ONCE(!rcu_read_lock_held(), "EM: rcu read lock needed\n"); +#endif + if (!sum_util) return 0; @@ -264,9 +269,10 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, * Find the lowest performance state of the Energy Model above the * requested frequency. */ - i = em_pd_get_efficient_state(pd->table, pd->nr_perf_states, freq, - pd->flags); - ps = &pd->table[i]; + em_table = rcu_dereference(pd->em_table); + i = em_pd_get_efficient_state(em_table->state, pd->nr_perf_states, + freq, pd->flags); + ps = &em_table->state[i]; /* * The capacity of a CPU in the domain at the performance state (ps) -- cgit v1.2.3 From ffcf9bce7af02a21fb73738999de1e3d4fde5aca Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 8 Feb 2024 11:55:44 +0000 Subject: PM: EM: Add functions for memory allocations for new EM tables The runtime modified EM table can be provided from drivers. Create a mechanism which allows device drivers to safely allocate and free the table. The same table can be used by the EAS in task scheduler code paths, so make sure the memory is not freed when the device driver module is unloaded. Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J.
Wysocki --- include/linux/energy_model.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 5f842da3bb0c..27911dc1887e 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -39,10 +40,12 @@ struct em_perf_state { /** * struct em_perf_table - Performance states table * @rcu: RCU used for safe access and destruction + * @kref: Reference counter to track the users * @state: List of performance states, in ascending order */ struct em_perf_table { struct rcu_head rcu; + struct kref kref; struct em_perf_state state[]; }; @@ -184,6 +187,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, struct em_data_callback *cb, cpumask_t *span, bool microwatts); void em_dev_unregister_perf_domain(struct device *dev); +struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd); +void em_table_free(struct em_perf_table __rcu *table); /** * em_pd_get_efficient_state() - Get an efficient performance state from the EM @@ -365,6 +370,12 @@ static inline int em_pd_nr_perf_states(struct em_perf_domain *pd) { return 0; } +static inline +struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd) +{ + return NULL; +} +static inline void em_table_free(struct em_perf_table __rcu *table) {} #endif #endif -- cgit v1.2.3 From 977230d5d50314f9920d3ee6348773d8babbfb58 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 8 Feb 2024 11:55:45 +0000 Subject: PM: EM: Introduce em_dev_update_perf_domain() for EM updates Add API function em_dev_update_perf_domain() which allows the EM to be changed safely. Concurrent updaters are serialized with a mutex and the removal of memory that will not be used any more is carried out with the help of RCU. 
Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 27911dc1887e..324a3a8e0a2d 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -183,6 +183,8 @@ struct em_data_callback { struct em_perf_domain *em_cpu_get(int cpu); struct em_perf_domain *em_pd_get(struct device *dev); +int em_dev_update_perf_domain(struct device *dev, + struct em_perf_table __rcu *new_table); int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, struct em_data_callback *cb, cpumask_t *span, bool microwatts); @@ -376,6 +378,12 @@ struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd) return NULL; } static inline void em_table_free(struct em_perf_table __rcu *table) {} +static inline +int em_dev_update_perf_domain(struct device *dev, + struct em_perf_table __rcu *new_table) +{ + return -EINVAL; +} #endif #endif -- cgit v1.2.3 From ee1a19873ce1234a3c2e6f84af3624fc73bfbd9c Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 8 Feb 2024 11:55:46 +0000 Subject: PM: EM: Add em_perf_state_from_pd() to get performance states table Introduce a wrapper to get the performance states table of the performance domain. The function should be called within the RCU read critical section. Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. 
Wysocki --- include/linux/energy_model.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 324a3a8e0a2d..158dad6ea313 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -338,6 +338,23 @@ static inline int em_pd_nr_perf_states(struct em_perf_domain *pd) return pd->nr_perf_states; } +/** + * em_perf_state_from_pd() - Get the performance states table of perf. + * domain + * @pd : performance domain for which this must be done + * + * To use this function the rcu_read_lock() should be hold. After the usage + * of the performance states table is finished, the rcu_read_unlock() should + * be called. + * + * Return: the pointer to performance states table of the performance domain + */ +static inline +struct em_perf_state *em_perf_state_from_pd(struct em_perf_domain *pd) +{ + return rcu_dereference(pd->em_table)->state; +} + #else struct em_data_callback {}; #define EM_ADV_DATA_CB(_active_power_cb, _cost_cb) { } @@ -384,6 +401,11 @@ int em_dev_update_perf_domain(struct device *dev, { return -EINVAL; } +static inline +struct em_perf_state *em_perf_state_from_pd(struct em_perf_domain *pd) +{ + return NULL; +} #endif #endif -- cgit v1.2.3 From 5a367f7b7014af86bd1ac0865a42db55187dbd3c Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 8 Feb 2024 11:55:47 +0000 Subject: PM: EM: Add performance field to struct em_perf_state and optimize The performance doesn't scale linearly with the frequency. Also, it may be different in different workloads. Some CPUs are designed to be particularly good at some applications, e.g. image or video processing, and other CPUs at different ones. When those different types of CPUs are combined in one SoC they should be properly modeled to get max of the HW in Energy Aware Scheduler (EAS). The Energy Model (EM) provides the power vs.
performance curves to the EAS, but assumes the CPUs capacity is fixed and scales linearly with the frequency. This patch allows to adjust the curve on the 'performance' axis as well. Code speed optimization: Removing map_util_freq() allows to avoid one division and one multiplication operations from the EAS hot code path. Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 158dad6ea313..ce24ea3fe41c 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -13,6 +13,7 @@ /** * struct em_perf_state - Performance state of a performance domain + * @performance: CPU performance (capacity) at a given frequency * @frequency: The frequency in KHz, for consistency with CPUFreq * @power: The power consumed at this level (by 1 CPU or by a registered * device). It can be a total power: static and dynamic. @@ -21,6 +22,7 @@ * @flags: see "em_perf_state flags" description below. */ struct em_perf_state { + unsigned long performance; unsigned long frequency; unsigned long power; unsigned long cost; @@ -196,25 +198,25 @@ void em_table_free(struct em_perf_table __rcu *table); * em_pd_get_efficient_state() - Get an efficient performance state from the EM * @table: List of performance states, in ascending order * @nr_perf_states: Number of performance states - * @freq: Frequency to map with the EM + * @max_util: Max utilization to map with the EM * @pd_flags: Performance Domain flags * * It is called from the scheduler code quite frequently and as a consequence * doesn't implement any check. * - * Return: An efficient performance state id, high enough to meet @freq + * Return: An efficient performance state id, high enough to meet @max_util * requirement. 
*/ static inline int em_pd_get_efficient_state(struct em_perf_state *table, int nr_perf_states, - unsigned long freq, unsigned long pd_flags) + unsigned long max_util, unsigned long pd_flags) { struct em_perf_state *ps; int i; for (i = 0; i < nr_perf_states; i++) { ps = &table[i]; - if (ps->frequency >= freq) { + if (ps->performance >= max_util) { if (pd_flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES && ps->flags & EM_PERF_STATE_INEFFICIENT) continue; @@ -245,9 +247,9 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, unsigned long max_util, unsigned long sum_util, unsigned long allowed_cpu_cap) { - unsigned long freq, ref_freq, scale_cpu; struct em_perf_table *em_table; struct em_perf_state *ps; + unsigned long scale_cpu; int cpu, i; #ifdef CONFIG_SCHED_DEBUG @@ -260,25 +262,23 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, /* * In order to predict the performance state, map the utilization of * the most utilized CPU of the performance domain to a requested - * frequency, like schedutil. Take also into account that the real - * frequency might be set lower (due to thermal capping). Thus, clamp + * performance, like schedutil. Take also into account that the real + * performance might be set lower (due to thermal capping). Thus, clamp * max utilization to the allowed CPU capacity before calculating - * effective frequency. + * effective performance. */ cpu = cpumask_first(to_cpumask(pd->cpus)); scale_cpu = arch_scale_cpu_capacity(cpu); - ref_freq = arch_scale_freq_ref(cpu); max_util = min(max_util, allowed_cpu_cap); - freq = map_util_freq(max_util, ref_freq, scale_cpu); /* * Find the lowest performance state of the Energy Model above the - * requested frequency. + * requested performance. 
*/ em_table = rcu_dereference(pd->em_table); i = em_pd_get_efficient_state(em_table->state, pd->nr_perf_states, - freq, pd->flags); + max_util, pd->flags); ps = &em_table->state[i]; /* -- cgit v1.2.3 From 1b600da510735a0f92c8b4140a7e2cb037a6a6c3 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 8 Feb 2024 11:55:49 +0000 Subject: PM: EM: Optimize em_cpu_energy() and remove division The Energy Model (EM) can be modified at runtime which brings new possibilities. The em_cpu_energy() is called by the Energy Aware Scheduler (EAS) in its hot path. The energy calculation uses power value for a given performance state (ps) and the CPU busy time as percentage for that given frequency. It is possible to avoid the division by 'scale_cpu' at runtime, because EM is updated whenever new max capacity CPU is set in the system. Use that feature and do the needed division during the calculation of the coefficient 'ps->cost'. That enhanced 'ps->cost' value can be then just multiplied simply by utilization: pd_nrg = ps->cost * \Sum cpu_util to get the needed energy for whole Performance Domain (PD). With this optimization and earlier removal of map_util_freq(), the em_cpu_energy() should run faster on the Big CPU by 1.43x and on the Little CPU by 1.69x (RockPi 4B board). Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 55 ++++++++++++-------------------------------- 1 file changed, 15 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index ce24ea3fe41c..aabfc26fcd31 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -115,27 +115,6 @@ struct em_perf_domain { #define EM_MAX_NUM_CPUS 16 #endif -/* - * To avoid an overflow on 32bit machines while calculating the energy - * use a different order in the operation. 
First divide by the 'cpu_scale' - * which would reduce big value stored in the 'cost' field, then multiply by - * the 'sum_util'. This would allow to handle existing platforms, which have - * e.g. power ~1.3 Watt at max freq, so the 'cost' value > 1mln micro-Watts. - * In such scenario, where there are 4 CPUs in the Perf. Domain the 'sum_util' - * could be 4096, then multiplication: 'cost' * 'sum_util' would overflow. - * This reordering of operations has some limitations, we lose small - * precision in the estimation (comparing to 64bit platform w/o reordering). - * - * We are safe on 64bit machine. - */ -#ifdef CONFIG_64BIT -#define em_estimate_energy(cost, sum_util, scale_cpu) \ - (((cost) * (sum_util)) / (scale_cpu)) -#else -#define em_estimate_energy(cost, sum_util, scale_cpu) \ - (((cost) / (scale_cpu)) * (sum_util)) -#endif - struct em_data_callback { /** * active_power() - Provide power at the next performance state of @@ -249,8 +228,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, { struct em_perf_table *em_table; struct em_perf_state *ps; - unsigned long scale_cpu; - int cpu, i; + int i; #ifdef CONFIG_SCHED_DEBUG WARN_ONCE(!rcu_read_lock_held(), "EM: rcu read lock needed\n"); @@ -267,9 +245,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, * max utilization to the allowed CPU capacity before calculating * effective performance. 
*/ - cpu = cpumask_first(to_cpumask(pd->cpus)); - scale_cpu = arch_scale_cpu_capacity(cpu); - + max_util = map_util_perf(max_util); max_util = min(max_util, allowed_cpu_cap); /* @@ -282,12 +258,12 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, ps = &em_table->state[i]; /* - * The capacity of a CPU in the domain at the performance state (ps) - * can be computed as: + * The performance (capacity) of a CPU in the domain at the performance + * state (ps) can be computed as: * - * ps->freq * scale_cpu - * ps->cap = -------------------- (1) - * cpu_max_freq + * ps->freq * scale_cpu + * ps->performance = -------------------- (1) + * cpu_max_freq * * So, ignoring the costs of idle states (which are not available in * the EM), the energy consumed by this CPU at that performance state @@ -295,9 +271,10 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, * * ps->power * cpu_util * cpu_nrg = -------------------- (2) - * ps->cap + * ps->performance * - * since 'cpu_util / ps->cap' represents its percentage of busy time. + * since 'cpu_util / ps->performance' represents its percentage of busy + * time. * * NOTE: Although the result of this computation actually is in * units of power, it can be manipulated as an energy value @@ -307,9 +284,9 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, * By injecting (1) in (2), 'cpu_nrg' can be re-expressed as a product * of two terms: * - * ps->power * cpu_max_freq cpu_util - * cpu_nrg = ------------------------ * --------- (3) - * ps->freq scale_cpu + * ps->power * cpu_max_freq + * cpu_nrg = ------------------------ * cpu_util (3) + * ps->freq * scale_cpu * * The first term is static, and is stored in the em_perf_state struct * as 'ps->cost'. 
@@ -319,11 +296,9 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, * total energy of the domain (which is the simple sum of the energy of * all of its CPUs) can be factorized as: * - * ps->cost * \Sum cpu_util - * pd_nrg = ------------------------ (4) - * scale_cpu + * pd_nrg = ps->cost * \Sum cpu_util (4) */ - return em_estimate_energy(ps->cost, sum_util, scale_cpu); + return ps->cost * sum_util; } /** -- cgit v1.2.3 From 24e9fb635df2790eccb0e95ff65c6dee7a97fcb7 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 8 Feb 2024 11:55:55 +0000 Subject: PM: EM: Remove old table Remove the old EM table which wasn't able to modify the data. Clean the unneeded function and refactor the code a bit. Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index aabfc26fcd31..92866a81abe4 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -53,7 +53,6 @@ struct em_perf_table { /** * struct em_perf_domain - Performance domain - * @table: List of performance states, in ascending order * @em_table: Pointer to the runtime modifiable em_perf_table * @nr_perf_states: Number of performance states * @flags: See "em_perf_domain flags" @@ -69,7 +68,6 @@ struct em_perf_table { * field is unused. */ struct em_perf_domain { - struct em_perf_state *table; struct em_perf_table __rcu *em_table; int nr_perf_states; unsigned long flags; -- cgit v1.2.3 From 22ea02848c07d1cbd15a5f442138ca429866300d Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 8 Feb 2024 11:55:56 +0000 Subject: PM: EM: Add em_dev_compute_costs() The device drivers can modify EM at runtime by providing a new EM table. The EM is used by the EAS and the em_perf_state::cost stores pre-calculated value to avoid overhead. 
This patch provides the API for device drivers to calculate the cost values properly (and not duplicate the same code). Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 92866a81abe4..770755df852f 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -170,6 +170,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, void em_dev_unregister_perf_domain(struct device *dev); struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd); void em_table_free(struct em_perf_table __rcu *table); +int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, + int nr_states); /** * em_pd_get_efficient_state() - Get an efficient performance state from the EM @@ -379,6 +381,12 @@ struct em_perf_state *em_perf_state_from_pd(struct em_perf_domain *pd) { return NULL; } +static inline +int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, + int nr_states) +{ + return -EINVAL; +} #endif #endif -- cgit v1.2.3 From b1344b1399daec9aca62bd0b2ea94874f5b8e126 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 29 Jan 2024 20:04:56 +0100 Subject: wifi: mac80211: add/use ieee80211_get_sn() This will also be useful for MLO duplicate multicast detection, but add it already here and use it in one place that trivially converts. 
Link: https://msgid.link/20240129200456.f0ff49c80006.I850d2785ab1640e56e262d3ad7343b87f6962552@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index d9d2c1253157..b9367d5f04c4 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -9,7 +9,7 @@ * Copyright (c) 2006, Michael Wu * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH * Copyright (c) 2016 - 2017 Intel Deutschland GmbH - * Copyright (c) 2018 - 2023 Intel Corporation + * Copyright (c) 2018 - 2024 Intel Corporation */ #ifndef LINUX_IEEE80211_H @@ -808,6 +808,11 @@ static inline bool ieee80211_is_frag(struct ieee80211_hdr *hdr) hdr->seq_ctrl & cpu_to_le16(IEEE80211_SCTL_FRAG); } +static inline u16 ieee80211_get_sn(struct ieee80211_hdr *hdr) +{ + return le16_get_bits(hdr->seq_ctrl, IEEE80211_SCTL_SEQ); +} + struct ieee80211s_hdr { u8 flags; u8 ttl; -- cgit v1.2.3 From 676259100cf3a81dd2d47918b36edb237986b9df Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 29 Jan 2024 20:04:57 +0100 Subject: wifi: mac80211: implement MLO multicast deduplication If the vif is an MLD then it may receive multicast from different links, and should drop those frames according to the SN. Implement that. 
Link: https://msgid.link/20240129200456.693b77d14b44.I491846f2bea0058c14eab6422962c10bfae9b675@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index b9367d5f04c4..e9078143b822 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -191,6 +191,11 @@ static inline bool ieee80211_sn_less(u16 sn1, u16 sn2) return ((sn1 - sn2) & IEEE80211_SN_MASK) > (IEEE80211_SN_MODULO >> 1); } +static inline bool ieee80211_sn_less_eq(u16 sn1, u16 sn2) +{ + return ((sn2 - sn1) & IEEE80211_SN_MASK) <= (IEEE80211_SN_MODULO >> 1); +} + static inline u16 ieee80211_sn_add(u16 sn1, u16 sn2) { return (sn1 + sn2) & IEEE80211_SN_MASK; -- cgit v1.2.3 From 6239da18d2f947523a80fb1f85f8d8a13d1726c1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 29 Jan 2024 20:19:28 +0100 Subject: wifi: mac80211: adjust EHT capa when lowering bandwidth If intending to associate with a lower bandwidth, remove capabilities related to 320 MHz from the EHT capabilities element. Also change the EHT MCS-NSS set accordingly: if just reducing 320->160 or similar the format doesn't change, just cut off the last bytes. If changing from higher bandwidth to 20 MHz only EHT STA, adjust the format. Note that this also requires adjusting the caller in mlme.c since the data written can now be shorter than it determined. We need to clean all that up. Since the other callers pass NULL for the conn limit, we don't need to change things there. 
Link: https://msgid.link/20240129202041.b5f6df108c77.I0d8ea04079c61cb3744cc88625eeaf0d4776dc2b@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index e9078143b822..e4322238f273 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -3060,6 +3060,9 @@ ieee80211_he_spr_size(const u8 *he_spr_ie) #define IEEE80211_EHT_PHY_CAP5_SUPP_EXTRA_EHT_LTF 0x40 #define IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK 0x07 +#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_80MHZ 0x08 +#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_160MHZ 0x30 +#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_320MHZ 0x40 #define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK 0x78 #define IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP 0x80 -- cgit v1.2.3 From a4af51ce229b1e1eab003966dbfebf9d80093a77 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Feb 2024 21:56:15 -0500 Subject: fs: super_set_uuid() Some weird old filesystems have UUID-like things that we wish to expose as UUIDs, but are smaller; add a length field so that the new FS_IOC_(GET|SET)UUID ioctls can handle them in generic code. And add a helper super_set_uuid(), for setting nonstandard length uuids. Helper is now required for the new FS_IOC_GETUUID ioctl; if super_set_uuid() hasn't been called, the ioctl won't be supported. 
Reviewed-by: Dave Chinner Signed-off-by: Kent Overstreet Link: https://lore.kernel.org/r/20240207025624.1019754-2-kent.overstreet@linux.dev Signed-off-by: Christian Brauner --- include/linux/fs.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a70495..acdc56987cb1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1257,6 +1257,7 @@ struct super_block { char s_id[32]; /* Informational name */ uuid_t s_uuid; /* UUID */ + u8 s_uuid_len; /* Default 16, possibly smaller for weird filesystems */ unsigned int s_max_links; @@ -2532,6 +2533,14 @@ extern __printf(2, 3) int super_setup_bdi_name(struct super_block *sb, char *fmt, ...); extern int super_setup_bdi(struct super_block *sb); +static inline void super_set_uuid(struct super_block *sb, const u8 *uuid, unsigned len) +{ + if (WARN_ON(len > sizeof(sb->s_uuid))) + len = sizeof(sb->s_uuid); + sb->s_uuid_len = len; + memcpy(&sb->s_uuid, uuid, len); +} + extern int current_umask(void); extern void ihold(struct inode * inode); -- cgit v1.2.3 From 4bcb982cce74e18155fba0d97394ca9634e0d8f0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 28 Jan 2024 20:05:47 -0700 Subject: io_uring: expand main struct io_kiocb flags to 64-bits We're out of space here, and none of the flags are easily reclaimable. Bump it to 64-bits and re-arrange the struct a bit to avoid gaps. Add a specific bitwise type for the request flags, io_request_flags_t. This will help catch violations of casting this value to a smaller type on 32-bit archs, like unsigned int. This creates a hole in the io_kiocb, so move nr_tw up and rsrc_node down to retain needing only cacheline 0 and 1 for non-polled opcodes. No functional changes intended in this patch. 
Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 77 +++++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 854ad67a5f70..56bf733d3ee6 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -468,70 +468,73 @@ enum { __REQ_F_LAST_BIT, }; +typedef u64 __bitwise io_req_flags_t; +#define IO_REQ_FLAG(bitno) ((__force io_req_flags_t) BIT_ULL((bitno))) + enum { /* ctx owns file */ - REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), + REQ_F_FIXED_FILE = IO_REQ_FLAG(REQ_F_FIXED_FILE_BIT), /* drain existing IO first */ - REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), + REQ_F_IO_DRAIN = IO_REQ_FLAG(REQ_F_IO_DRAIN_BIT), /* linked sqes */ - REQ_F_LINK = BIT(REQ_F_LINK_BIT), + REQ_F_LINK = IO_REQ_FLAG(REQ_F_LINK_BIT), /* doesn't sever on completion < 0 */ - REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), + REQ_F_HARDLINK = IO_REQ_FLAG(REQ_F_HARDLINK_BIT), /* IOSQE_ASYNC */ - REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), + REQ_F_FORCE_ASYNC = IO_REQ_FLAG(REQ_F_FORCE_ASYNC_BIT), /* IOSQE_BUFFER_SELECT */ - REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), + REQ_F_BUFFER_SELECT = IO_REQ_FLAG(REQ_F_BUFFER_SELECT_BIT), /* IOSQE_CQE_SKIP_SUCCESS */ - REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT), + REQ_F_CQE_SKIP = IO_REQ_FLAG(REQ_F_CQE_SKIP_BIT), /* fail rest of links */ - REQ_F_FAIL = BIT(REQ_F_FAIL_BIT), + REQ_F_FAIL = IO_REQ_FLAG(REQ_F_FAIL_BIT), /* on inflight list, should be cancelled and waited on exit reliably */ - REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), + REQ_F_INFLIGHT = IO_REQ_FLAG(REQ_F_INFLIGHT_BIT), /* read/write uses file position */ - REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), + REQ_F_CUR_POS = IO_REQ_FLAG(REQ_F_CUR_POS_BIT), /* must not punt to workers */ - REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), + REQ_F_NOWAIT = IO_REQ_FLAG(REQ_F_NOWAIT_BIT), /* has or had linked timeout */ - REQ_F_LINK_TIMEOUT = 
BIT(REQ_F_LINK_TIMEOUT_BIT), + REQ_F_LINK_TIMEOUT = IO_REQ_FLAG(REQ_F_LINK_TIMEOUT_BIT), /* needs cleanup */ - REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), + REQ_F_NEED_CLEANUP = IO_REQ_FLAG(REQ_F_NEED_CLEANUP_BIT), /* already went through poll handler */ - REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), + REQ_F_POLLED = IO_REQ_FLAG(REQ_F_POLLED_BIT), /* buffer already selected */ - REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), + REQ_F_BUFFER_SELECTED = IO_REQ_FLAG(REQ_F_BUFFER_SELECTED_BIT), /* buffer selected from ring, needs commit */ - REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT), + REQ_F_BUFFER_RING = IO_REQ_FLAG(REQ_F_BUFFER_RING_BIT), /* caller should reissue async */ - REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), + REQ_F_REISSUE = IO_REQ_FLAG(REQ_F_REISSUE_BIT), /* supports async reads/writes */ - REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT), + REQ_F_SUPPORT_NOWAIT = IO_REQ_FLAG(REQ_F_SUPPORT_NOWAIT_BIT), /* regular file */ - REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), + REQ_F_ISREG = IO_REQ_FLAG(REQ_F_ISREG_BIT), /* has creds assigned */ - REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), + REQ_F_CREDS = IO_REQ_FLAG(REQ_F_CREDS_BIT), /* skip refcounting if not set */ - REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), + REQ_F_REFCOUNT = IO_REQ_FLAG(REQ_F_REFCOUNT_BIT), /* there is a linked timeout that has to be armed */ - REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), + REQ_F_ARM_LTIMEOUT = IO_REQ_FLAG(REQ_F_ARM_LTIMEOUT_BIT), /* ->async_data allocated */ - REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), + REQ_F_ASYNC_DATA = IO_REQ_FLAG(REQ_F_ASYNC_DATA_BIT), /* don't post CQEs while failing linked requests */ - REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT), + REQ_F_SKIP_LINK_CQES = IO_REQ_FLAG(REQ_F_SKIP_LINK_CQES_BIT), /* single poll may be active */ - REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT), + REQ_F_SINGLE_POLL = IO_REQ_FLAG(REQ_F_SINGLE_POLL_BIT), /* double poll may active */ - REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), + REQ_F_DOUBLE_POLL = 
IO_REQ_FLAG(REQ_F_DOUBLE_POLL_BIT), /* request has already done partial IO */ - REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), + REQ_F_PARTIAL_IO = IO_REQ_FLAG(REQ_F_PARTIAL_IO_BIT), /* fast poll multishot mode */ - REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), + REQ_F_APOLL_MULTISHOT = IO_REQ_FLAG(REQ_F_APOLL_MULTISHOT_BIT), /* recvmsg special flag, clear EPOLLIN */ - REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), + REQ_F_CLEAR_POLLIN = IO_REQ_FLAG(REQ_F_CLEAR_POLLIN_BIT), /* hashed into ->cancel_hash_locked, protected by ->uring_lock */ - REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT), + REQ_F_HASH_LOCKED = IO_REQ_FLAG(REQ_F_HASH_LOCKED_BIT), /* don't use lazy poll wake for this request */ - REQ_F_POLL_NO_LAZY = BIT(REQ_F_POLL_NO_LAZY_BIT), + REQ_F_POLL_NO_LAZY = IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT), }; typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); @@ -592,15 +595,17 @@ struct io_kiocb { * and after selection it points to the buffer ID itself. */ u16 buf_index; - unsigned int flags; + + unsigned nr_tw; + + /* REQ_F_* flags */ + io_req_flags_t flags; struct io_cqe cqe; struct io_ring_ctx *ctx; struct task_struct *task; - struct io_rsrc_node *rsrc_node; - union { /* store used ubuf, so we can prevent reloading */ struct io_mapped_ubuf *imu; @@ -621,10 +626,12 @@ struct io_kiocb { /* cache ->apoll->events */ __poll_t apoll_events; }; + + struct io_rsrc_node *rsrc_node; + atomic_t refs; atomic_t poll_refs; struct io_task_work io_task_work; - unsigned nr_tw; /* for polled requests, i.e. 
IORING_OP_POLL_ADD and async armed poll */ struct hlist_node hash_node; /* internal polling, see IORING_FEAT_FAST_POLL */ -- cgit v1.2.3 From 521223d7c229f83915619f888c99e952f24dc39f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 28 Jan 2024 20:11:55 -0700 Subject: io_uring/cancel: don't default to setting req->work.cancel_seq Just leave it unset by default, avoiding dipping into the last cacheline (which is otherwise untouched) for the fast path of using poll to drive networked traffic. Add a flag that tells us if the sequence is valid or not, and then we can defer actually assigning the flag and sequence until someone runs cancelations. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 56bf733d3ee6..e19698daae1a 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -463,6 +463,7 @@ enum { REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, REQ_F_POLL_NO_LAZY_BIT, + REQ_F_CANCEL_SEQ_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -535,6 +536,8 @@ enum { REQ_F_HASH_LOCKED = IO_REQ_FLAG(REQ_F_HASH_LOCKED_BIT), /* don't use lazy poll wake for this request */ REQ_F_POLL_NO_LAZY = IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT), + /* cancel sequence is set and valid */ + REQ_F_CANCEL_SEQ = IO_REQ_FLAG(REQ_F_CANCEL_SEQ_BIT), }; typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); -- cgit v1.2.3 From 95041b93e90a06bb613ec4bef9cd4d61570f68e4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 28 Jan 2024 20:08:24 -0700 Subject: io_uring: add io_file_can_poll() helper This adds a flag to avoid dereferencing file and then f_op to figure out if the file has a poll handler defined or not. We generally call this at least twice for networked workloads, and if using ring provided buffers, we do it on every buffer selection. 
Particularly the latter is troublesome, as it's otherwise a very fast operation. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index e19698daae1a..4ddc7b3168f3 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -464,6 +464,7 @@ enum { REQ_F_ISREG_BIT, REQ_F_POLL_NO_LAZY_BIT, REQ_F_CANCEL_SEQ_BIT, + REQ_F_CAN_POLL_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -538,6 +539,8 @@ enum { REQ_F_POLL_NO_LAZY = IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT), /* cancel sequence is set and valid */ REQ_F_CANCEL_SEQ = IO_REQ_FLAG(REQ_F_CANCEL_SEQ_BIT), + /* file is pollable */ + REQ_F_CAN_POLL = IO_REQ_FLAG(REQ_F_CAN_POLL_BIT), }; typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); -- cgit v1.2.3 From da08d2edb020026beac01d087d3b37e479fdb7e9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 8 Feb 2024 09:28:52 -0700 Subject: io_uring: re-arrange struct io_ring_ctx to reduce padding Nothing major here, just moving a few things around to reduce the padding. This reduces the size on a non-debug kernel from 1536 to 1472 bytes, saving a full cacheline. 
Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 4ddc7b3168f3..addfcc74d851 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -240,6 +240,7 @@ struct io_ring_ctx { unsigned int poll_activated: 1; unsigned int drain_disabled: 1; unsigned int compat: 1; + unsigned int iowq_limits_set : 1; struct task_struct *submitter_task; struct io_rings *rings; @@ -274,10 +275,20 @@ struct io_ring_ctx { */ struct io_rsrc_node *rsrc_node; atomic_t cancel_seq; + + /* + * ->iopoll_list is protected by the ctx->uring_lock for + * io_uring instances that don't use IORING_SETUP_SQPOLL. + * For SQPOLL, only the single threaded io_sq_thread() will + * manipulate the list, hence no extra locking is needed there. + */ + bool poll_multi_queue; + struct io_wq_work_list iopoll_list; + struct io_file_table file_table; + struct io_mapped_ubuf **user_bufs; unsigned nr_user_files; unsigned nr_user_bufs; - struct io_mapped_ubuf **user_bufs; struct io_submit_state submit_state; @@ -288,15 +299,6 @@ struct io_ring_ctx { struct io_alloc_cache apoll_cache; struct io_alloc_cache netmsg_cache; - /* - * ->iopoll_list is protected by the ctx->uring_lock for - * io_uring instances that don't use IORING_SETUP_SQPOLL. - * For SQPOLL, only the single threaded io_sq_thread() will - * manipulate the list, hence no extra locking is needed there. 
- */ - struct io_wq_work_list iopoll_list; - bool poll_multi_queue; - /* * Any cancelable uring_cmd is added to this list in * ->uring_cmd() by io_uring_cmd_insert_cancelable() @@ -343,8 +345,8 @@ struct io_ring_ctx { spinlock_t completion_lock; /* IRQ completion list, under ->completion_lock */ - struct io_wq_work_list locked_free_list; unsigned int locked_free_nr; + struct io_wq_work_list locked_free_list; struct list_head io_buffers_comp; struct list_head cq_overflow_list; @@ -366,9 +368,6 @@ struct io_ring_ctx { unsigned int file_alloc_start; unsigned int file_alloc_end; - struct xarray personalities; - u32 pers_next; - struct list_head io_buffers_cache; /* deferred free list, protected by ->uring_lock */ @@ -389,6 +388,9 @@ struct io_ring_ctx { struct wait_queue_head rsrc_quiesce_wq; unsigned rsrc_quiesce; + u32 pers_next; + struct xarray personalities; + /* hashed buffered write serialization */ struct io_wq_hash *hash_map; @@ -405,7 +407,6 @@ struct io_ring_ctx { /* io-wq management, e.g. thread count */ u32 iowq_limits[2]; - bool iowq_limits_set; struct callback_head poll_wq_task_work; struct list_head defer_list; -- cgit v1.2.3 From d1ff85fdf0b8f63a6e042ae7559c630f9b1c50e2 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 8 Feb 2024 21:21:52 +0100 Subject: spi: pl022: Use typedef for dma_filter_fn Use existing typedef for dma_filter_fn to avoid duplicating type definition. 
Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240208202154.630336-1-krzysztof.kozlowski@linaro.org Signed-off-by: Mark Brown --- include/linux/amba/pl022.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/amba/pl022.h b/include/linux/amba/pl022.h index 9bf58aac0df2..e08488df6d28 100644 --- a/include/linux/amba/pl022.h +++ b/include/linux/amba/pl022.h @@ -16,6 +16,7 @@ #ifndef _SSP_PL022_H #define _SSP_PL022_H +#include #include /** @@ -235,7 +236,7 @@ struct dma_chan; struct pl022_ssp_controller { u16 bus_id; u8 enable_dma:1; - bool (*dma_filter)(struct dma_chan *chan, void *filter_param); + dma_filter_fn dma_filter; void *dma_rx_param; void *dma_tx_param; int autosuspend_delay; -- cgit v1.2.3 From c42d9bead493854507e1a180942ebe33c9180598 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 8 Feb 2024 21:21:53 +0100 Subject: spi: pl022: Add missing dma_filter field kerneldoc Add kerneldoc for dma_filter field in struct pl022_ssp_controller. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240208202154.630336-2-krzysztof.kozlowski@linaro.org Signed-off-by: Mark Brown --- include/linux/amba/pl022.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/amba/pl022.h b/include/linux/amba/pl022.h index e08488df6d28..d7b07d0311e1 100644 --- a/include/linux/amba/pl022.h +++ b/include/linux/amba/pl022.h @@ -225,6 +225,7 @@ struct dma_chan; * struct pl022_ssp_master - device.platform_data for SPI controller devices. * @bus_id: identifier for this bus * @enable_dma: if true enables DMA driven transfers. + * @dma_filter: callback filter for dma_request_channel. * @dma_rx_param: parameter to locate an RX DMA channel. * @dma_tx_param: parameter to locate a TX DMA channel. 
* @autosuspend_delay: delay in ms following transfer completion before the -- cgit v1.2.3 From 3d4dd10b376e1b8b6d0409f7e7b752f9baa51c24 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 8 Feb 2024 21:21:54 +0100 Subject: spi: pxa2xx: Use typedef for dma_filter_fn Use existing typedef for dma_filter_fn to avoid duplicating type definition. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240208202154.630336-3-krzysztof.kozlowski@linaro.org Signed-off-by: Mark Brown --- include/linux/spi/pxa2xx_spi.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spi/pxa2xx_spi.h b/include/linux/spi/pxa2xx_spi.h index 0916cb9bcb0a..ca2cd4e30ead 100644 --- a/include/linux/spi/pxa2xx_spi.h +++ b/include/linux/spi/pxa2xx_spi.h @@ -5,6 +5,7 @@ #ifndef __LINUX_SPI_PXA2XX_SPI_H #define __LINUX_SPI_PXA2XX_SPI_H +#include #include #include @@ -22,7 +23,7 @@ struct pxa2xx_spi_controller { bool is_target; /* DMA engine specific config */ - bool (*dma_filter)(struct dma_chan *chan, void *param); + dma_filter_fn dma_filter; void *tx_param; void *rx_param; -- cgit v1.2.3 From e891becdccaa9048b1ab91c08ad5722edd571806 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Wed, 7 Feb 2024 22:39:14 +0100 Subject: PCI: endpoint: Refactor pci_epf_alloc_space() API Refactor pci_epf_alloc_space() API to accept "epc_features" as a parameter. This is a preparatory work to make the API more robust. 
Reviewed-by: Frank Li Reviewed-by: Manivannan Sadhasivam Signed-off-by: Niklas Cassel Link: https://lore.kernel.org/r/20240207213922.1796533-2-cassel@kernel.org [mani: reworded commit message] Signed-off-by: Manivannan Sadhasivam --- include/linux/pci-epf.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h index 77b146e0f672..adee6a1b35db 100644 --- a/include/linux/pci-epf.h +++ b/include/linux/pci-epf.h @@ -15,6 +15,7 @@ #include struct pci_epf; +struct pci_epc_features; enum pci_epc_interface_type; enum pci_barno { @@ -216,7 +217,8 @@ int __pci_epf_register_driver(struct pci_epf_driver *driver, struct module *owner); void pci_epf_unregister_driver(struct pci_epf_driver *driver); void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar, - size_t align, enum pci_epc_interface_type type); + const struct pci_epc_features *epc_features, + enum pci_epc_interface_type type); void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar, enum pci_epc_interface_type type); int pci_epf_bind(struct pci_epf *epf); -- cgit v1.2.3 From 7d708c145b2631941b8b0b4a740dc2990818c39c Mon Sep 17 00:00:00 2001 From: Thinh Nguyen Date: Fri, 9 Feb 2024 01:24:54 +0000 Subject: Revert "usb: dwc3: Support EBC feature of DWC_usb31" This reverts commit 398aa9a7e77cf23c2a6f882ddd3dcd96f21771dc. The update to the gadget API to support EBC feature is incomplete. It's missing at least the following: * New usage documentation * Gadget capability check * Condition for the user to check how many and which endpoints can be used as "fifo_mode" * Description of how it can affect completed request (e.g. dwc3 won't update TRB on completion -- ie. how it can affect request's actual length report) Let's revert this until it's ready. 
Fixes: 398aa9a7e77c ("usb: dwc3: Support EBC feature of DWC_usb31") Signed-off-by: Thinh Nguyen Link: https://lore.kernel.org/r/3042f847ff904b4dd4e4cf66a1b9df470e63439e.1707441690.git.Thinh.Nguyen@synopsys.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/gadget.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index a771ccc038ac..6532beb587b1 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -236,7 +236,6 @@ struct usb_ep { unsigned max_streams:16; unsigned mult:2; unsigned maxburst:5; - unsigned fifo_mode:1; u8 address; const struct usb_endpoint_descriptor *desc; const struct usb_ss_ep_comp_descriptor *comp_desc; -- cgit v1.2.3 From e3caf184107a4e2e196528b98b218ddc41e4cb8c Mon Sep 17 00:00:00 2001 From: Jinjian Song Date: Mon, 5 Feb 2024 18:22:27 +0800 Subject: wwan: core: Add WWAN fastboot port type Add a new WWAN port that connects to the device fastboot protocol interface. Signed-off-by: Jinjian Song Signed-off-by: David S. 
Miller --- include/linux/wwan.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wwan.h b/include/linux/wwan.h index 01fa15506286..170fdee6339c 100644 --- a/include/linux/wwan.h +++ b/include/linux/wwan.h @@ -16,6 +16,7 @@ * @WWAN_PORT_QCDM: Qcom Modem diagnostic interface * @WWAN_PORT_FIREHOSE: XML based command protocol * @WWAN_PORT_XMMRPC: Control protocol for Intel XMM modems + * @WWAN_PORT_FASTBOOT: Fastboot protocol control * * @WWAN_PORT_MAX: Highest supported port types * @WWAN_PORT_UNKNOWN: Special value to indicate an unknown port type @@ -28,6 +29,7 @@ enum wwan_port_type { WWAN_PORT_QCDM, WWAN_PORT_FIREHOSE, WWAN_PORT_XMMRPC, + WWAN_PORT_FASTBOOT, /* Add new port types above this line */ -- cgit v1.2.3 From 6f35fe5d8a0ad0125c52fb20f5d67000e369eb3a Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Mon, 5 Feb 2024 11:56:01 +0000 Subject: iommu/amd: Introduce get_amd_iommu_from_dev() Introduce get_amd_iommu_from_dev() and get_amd_iommu_from_dev_data(). And replace rlookup_amd_iommu() with the new helper function where applicable to avoid unnecessary loop to look up struct amd_iommu from struct device. 
Suggested-by: Jason Gunthorpe Signed-off-by: Suravee Suthikulpanit Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240205115615.6053-4-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 1ea2a820e1eb..32bb121e8032 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -654,6 +654,22 @@ static inline struct iommu_device *dev_to_iommu_device(struct device *dev) return (struct iommu_device *)dev_get_drvdata(dev); } +/** + * iommu_get_iommu_dev - Get iommu_device for a device + * @dev: an end-point device + * + * Note that this function must be called from the iommu_ops + * to retrieve the iommu_device for a device, which the core code + * guarentees it will not invoke the op without an attached iommu. + */ +static inline struct iommu_device *__iommu_get_iommu_dev(struct device *dev) +{ + return dev->iommu->iommu_dev; +} + +#define iommu_get_iommu_dev(dev, type, member) \ + container_of(__iommu_get_iommu_dev(dev), type, member) + static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather) { *gather = (struct iommu_iotlb_gather) { -- cgit v1.2.3 From bf8aff2945ba4091f503df673b9df33002546e6a Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Mon, 5 Feb 2024 11:56:07 +0000 Subject: iommu: Introduce iommu_group_mutex_assert() Add function to check iommu group mutex lock. So that device drivers can rely on group mutex lock instead of adding another driver level lock before modifying driver specific device data structure. 
Suggested-by: Jason Gunthorpe Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240205115615.6053-10-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 32bb121e8032..8141a37556d5 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1356,6 +1356,14 @@ static inline ioasid_t iommu_alloc_global_pasid(struct device *dev) static inline void iommu_free_global_pasid(ioasid_t pasid) {} #endif /* CONFIG_IOMMU_API */ +#if IS_ENABLED(CONFIG_LOCKDEP) && IS_ENABLED(CONFIG_IOMMU_API) +void iommu_group_mutex_assert(struct device *dev); +#else +static inline void iommu_group_mutex_assert(struct device *dev) +{ +} +#endif + /** * iommu_map_sgtable - Map the given buffer to the IOMMU domain * @domain: The IOMMU domain to perform the mapping -- cgit v1.2.3 From fd7eea27a3aed79b63b1726c00bde0d50cf207e2 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 5 Feb 2024 16:48:43 +0100 Subject: Compiler Attributes: Add __uninitialized macro With INIT_STACK_ALL_PATTERN or INIT_STACK_ALL_ZERO enabled the kernel will be compiled with -ftrivial-auto-var-init=<...> which causes initialization of stack variables at function entry time. In order to avoid the performance impact that comes with this users can use the "uninitialized" attribute to prevent such initialization. Therefore provide the __uninitialized macro which can be used for cases where INIT_STACK_ALL_PATTERN or INIT_STACK_ALL_ZERO is enabled, but only selected variables should not be initialized. 
Acked-by: Kees Cook Reviewed-by: Nathan Chancellor Link: https://lore.kernel.org/r/20240205154844.3757121-2-hca@linux.ibm.com Signed-off-by: Heiko Carstens --- include/linux/compiler_attributes.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index 28566624f008..f5859b8c68b4 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -333,6 +333,18 @@ */ #define __section(section) __attribute__((__section__(section))) +/* + * Optional: only supported since gcc >= 12 + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-uninitialized-variable-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#uninitialized + */ +#if __has_attribute(__uninitialized__) +# define __uninitialized __attribute__((__uninitialized__)) +#else +# define __uninitialized +#endif + /* * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-unused-function-attribute * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Type-Attributes.html#index-unused-type-attribute -- cgit v1.2.3 From fc5b764bef24d0cf722deb5c1a44948cd17d4afe Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 9 Feb 2024 18:54:23 +0200 Subject: spi: gpio: Follow renaming of SPI "master" to "controller" In commit 8caab75fd2c2 ("spi: Generalize SPI "master" to "controller"") some functions and struct members were renamed. Recent work by Uwe completes this renaming. However, there are plenty of leftovers in the comments and in-code documentation. Update them as well. 
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240209165423.2305493-1-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/spi/spi_gpio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi_gpio.h b/include/linux/spi/spi_gpio.h index 9e7e83d8645b..5f0e1407917a 100644 --- a/include/linux/spi/spi_gpio.h +++ b/include/linux/spi/spi_gpio.h @@ -15,8 +15,8 @@ */ /** - * struct spi_gpio_platform_data - parameter for bitbanged SPI master - * @num_chipselect: how many slaves to allow + * struct spi_gpio_platform_data - parameter for bitbanged SPI host controller + * @num_chipselect: how many target devices to allow */ struct spi_gpio_platform_data { u16 num_chipselect; -- cgit v1.2.3 From 8d0c12a80cdeb80d5e0510e96d38fe551ed8e9b5 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Thu, 8 Jun 2023 09:38:36 -0700 Subject: io-uring: add napi busy poll support This adds the napi busy polling support in io_uring.c. It adds a new napi_list to the io_ring_ctx structure. This list contains the list of napi_id's that are currently enabled for busy polling. The list is synchronized by the new napi_lock spin lock. The current default napi busy polling time is stored in napi_busy_poll_to. If napi busy polling is not enabled, the value is 0. In addition there is also a hash table. The hash table stores the napi id and the pointer to the above list nodes. The hash table is used to speed up the lookup to the list elements. The hash table is synchronized with rcu. The NAPI_TIMEOUT is stored as a timeout to make sure that the time a napi entry is stored in the napi list is limited. The busy poll timeout is also stored as part of the io_wait_queue. This is necessary as for sq polling the poll interval needs to be adjusted and the napi callback only allows one value to be passed in. 
This has been tested with two simple programs from the liburing library repository: the napi client and the napi server program. The client sends a request, which has a timestamp in its payload and the server replies with the same payload. The client calculates the roundtrip time and stores it to calculate the results. The client is running on host1 and the server is running on host 2 (in the same rack). The measured times below are roundtrip times. They are average times over 5 runs each. Each run measures 1 million roundtrips. no rx coal rx coal: frames=88,usecs=33 Default 57us 56us client_poll=100us 47us 46us server_poll=100us 51us 46us client_poll=100us+ 40us 40us server_poll=100us client_poll=100us+ 41us 39us server_poll=100us+ prefer napi busy poll on client client_poll=100us+ 41us 39us server_poll=100us+ prefer napi busy poll on server client_poll=100us+ 41us 39us server_poll=100us+ prefer napi busy poll on client + server Signed-off-by: Stefan Roesch Suggested-by: Olivier Langlois Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230608163839.2891748-5-shr@devkernel.io Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index addfcc74d851..4fe7af8a4907 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -2,6 +2,7 @@ #define IO_URING_TYPES_H #include +#include #include #include #include @@ -247,6 +248,7 @@ struct io_ring_ctx { struct percpu_ref refs; enum task_work_notify_mode notify_method; + unsigned sq_thread_idle; } ____cacheline_aligned_in_smp; /* submission data */ @@ -410,7 +412,18 @@ struct io_ring_ctx { struct callback_head poll_wq_task_work; struct list_head defer_list; - unsigned sq_thread_idle; + +#ifdef CONFIG_NET_RX_BUSY_POLL + struct list_head napi_list; /* track busy poll napi_id */ + spinlock_t napi_lock; /* napi_list 
lock */ + + /* napi busy poll default timeout */ + unsigned int napi_busy_poll_to; + bool napi_prefer_busy_poll; + + DECLARE_HASHTABLE(napi_ht, 4); +#endif + /* protected by ->completion_lock */ unsigned evfd_last_cq_tail; -- cgit v1.2.3 From 8f172181f24bb5df7675225d9b5b66d059613f50 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Feb 2024 14:11:56 -1000 Subject: workqueue: Implement workqueue_set_min_active() Since 5797b1c18919 ("workqueue: Implement system-wide nr_active enforcement for unbound workqueues"), unbound workqueues have separate min_active which sets the number of interdependent work items that can be handled. This value is currently initialized to WQ_DFL_MIN_ACTIVE which is 8. This isn't high enough for some users, let's add an interface to adjust the setting. Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 4ba33cf07f11..1565bab9edc8 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -553,6 +553,8 @@ extern bool flush_rcu_work(struct rcu_work *rwork); extern void workqueue_set_max_active(struct workqueue_struct *wq, int max_active); +extern void workqueue_set_min_active(struct workqueue_struct *wq, + int min_active); extern struct work_struct *current_work(void); extern bool current_is_workqueue_rescuer(void); extern bool workqueue_congested(int cpu, struct workqueue_struct *wq); -- cgit v1.2.3 From bf52b1ac6ab41a060511d56d0f2da12f3a2486db Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Feb 2024 14:14:16 -1000 Subject: async: Use a dedicated unbound workqueue with raised min_active Async can schedule a number of interdependent work items. However, since 5797b1c18919 ("workqueue: Implement system-wide nr_active enforcement for unbound workqueues"), unbound workqueues have separate min_active which sets the number of interdependent work items that can be handled. 
This default value is 8 which isn't sufficient for async and can lead to stalls during resume from suspend in some cases. Let's use a dedicated unbound workqueue with raised min_active. Link: http://lkml.kernel.org/r/708a65cc-79ec-44a6-8454-a93d0f3114c3@samsung.com Reported-by: Marek Szyprowski Cc: Rafael J. Wysocki Tested-by: Marek Szyprowski Signed-off-by: Tejun Heo --- include/linux/async.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/async.h b/include/linux/async.h index 33c9ff4afb49..19b778d08600 100644 --- a/include/linux/async.h +++ b/include/linux/async.h @@ -120,4 +120,5 @@ extern void async_synchronize_cookie(async_cookie_t cookie); extern void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain); extern bool current_is_async(void); +extern void async_init(void); #endif -- cgit v1.2.3 From 4356e9f841f7fbb945521cef3577ba394c65f3fc Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 9 Feb 2024 12:39:31 -0800 Subject: work around gcc bugs with 'asm goto' with outputs We've had issues with gcc and 'asm goto' before, and we created a 'asm_volatile_goto()' macro for that in the past: see commits 3f0116c3238a ("compiler/gcc4: Add quirk for 'asm goto' miscompilation bug") and a9f180345f53 ("compiler/gcc4: Make quirk for asm_volatile_goto() unconditional"). Then, much later, we ended up removing the workaround in commit 43c249ea0b1e ("compiler-gcc.h: remove ancient workaround for gcc PR 58670") because we no longer supported building the kernel with the affected gcc versions, but we left the macro uses around. Now, Sean Christopherson reports a new version of a very similar problem, which is fixed by re-applying that ancient workaround. But the problem in question is limited to only the 'asm goto with outputs' cases, so instead of re-introducing the old workaround as-is, let's rename and limit the workaround to just that much less common case. 
It looks like there are at least two separate issues that all hit in this area: (a) some versions of gcc don't mark the asm goto as 'volatile' when it has outputs: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98619 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110420 which is easy to work around by just adding the 'volatile' by hand. (b) Internal compiler errors: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110422 which are worked around by adding the extra empty 'asm' as a barrier, as in the original workaround. but the problem Sean sees may be a third thing since it involves bad code generation (not an ICE) even with the manually added 'volatile'. but the same old workaround works for this case, even if this feels a bit like voodoo programming and may only be hiding the issue. Reported-and-tested-by: Sean Christopherson Link: https://lore.kernel.org/all/20240208220604.140859-1-seanjc@google.com/ Cc: Nick Desaulniers Cc: Uros Bizjak Cc: Jakub Jelinek Cc: Andrew Pinski Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc.h | 19 +++++++++++++++++++ include/linux/compiler_types.h | 4 ++-- 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index aebb65bf95a7..c1a963be7d28 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -64,6 +64,25 @@ __builtin_unreachable(); \ } while (0) +/* + * GCC 'asm goto' with outputs miscompiles certain code sequences: + * + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110420 + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110422 + * + * Work it around via the same compiler barrier quirk that we used + * to use for the old 'asm goto' workaround. 
+ * + * Also, always mark such 'asm goto' statements as volatile: all + * asm goto statements are supposed to be volatile as per the + * documentation, but some versions of gcc didn't actually do + * that for asms with outputs: + * + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98619 + */ +#define asm_goto_output(x...) \ + do { asm volatile goto(x); asm (""); } while (0) + #if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP) #define __HAVE_BUILTIN_BSWAP32__ #define __HAVE_BUILTIN_BSWAP64__ diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 6f1ca49306d2..663d8791c871 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -362,8 +362,8 @@ struct ftrace_likely_data { #define __member_size(p) __builtin_object_size(p, 1) #endif -#ifndef asm_volatile_goto -#define asm_volatile_goto(x...) asm goto(x) +#ifndef asm_goto_output +#define asm_goto_output(x...) asm goto(x) #endif #ifdef CONFIG_CC_HAS_ASM_INLINE -- cgit v1.2.3 From 7e4a205fe56b9092f0143dad6aa5fee081139b09 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 3 Feb 2024 23:53:05 -0500 Subject: Revert "get rid of DCACHE_GENOCIDE" This reverts commit 57851607326a2beef21e67f83f4f53a90df8445a. Unfortunately, while we only call that thing once, the callback *can* be called more than once for the same dentry - all it takes is rename_lock being touched while we are in d_walk(). For now let's revert it. 
Signed-off-by: Al Viro --- include/linux/dcache.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 1666c387861f..d07cf2f1bb7d 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -173,6 +173,7 @@ struct dentry_operations { #define DCACHE_DONTCACHE BIT(7) /* Purge from memory on final dput() */ #define DCACHE_CANT_MOUNT BIT(8) +#define DCACHE_GENOCIDE BIT(9) #define DCACHE_SHRINK_LIST BIT(10) #define DCACHE_OP_WEAK_REVALIDATE BIT(11) -- cgit v1.2.3 From 471e8fd3afcef5a9f9089f0bd21965ad9ba35c91 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Tue, 6 Feb 2024 18:31:06 +0100 Subject: net: phy: add devm/of_phy_package_join helper Add devm/of_phy_package_join helper to join PHYs in a PHY package. These are variants of the manual phy_package_join with the difference that these will use DT nodes to derive the base_addr instead of manually passing a hardcoded value. An additional value is added in phy_package_shared, "np" to reference the PHY package node pointer in specific PHY driver probe_once and config_init_once functions to make use of additional specific properties defined in the PHY package node in DT. The np value is filled only with of_phy_package_join if a valid PHY package node is found. A valid PHY package node must have the node name set to "ethernet-phy-package". Signed-off-by: Christian Marangi Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- include/linux/phy.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index fd8dbea9b4d9..cbd49418b819 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -329,6 +329,7 @@ struct mdio_bus_stats { * struct phy_package_shared - Shared information in PHY packages * @base_addr: Base PHY address of PHY package used to combine PHYs * in one package and for offset calculation of phy_package_read/write + * @np: Pointer to the Device Node if PHY package defined in DT * @refcnt: Number of PHYs connected to this shared data * @flags: Initialization of PHY package * @priv_size: Size of the shared private data @priv @@ -340,6 +341,8 @@ struct mdio_bus_stats { */ struct phy_package_shared { u8 base_addr; + /* With PHY package defined in DT this points to the PHY package node */ + struct device_node *np; refcount_t refcnt; unsigned long flags; size_t priv_size; @@ -2000,9 +2003,12 @@ int phy_ethtool_set_link_ksettings(struct net_device *ndev, const struct ethtool_link_ksettings *cmd); int phy_ethtool_nway_reset(struct net_device *ndev); int phy_package_join(struct phy_device *phydev, int base_addr, size_t priv_size); +int of_phy_package_join(struct phy_device *phydev, size_t priv_size); void phy_package_leave(struct phy_device *phydev); int devm_phy_package_join(struct device *dev, struct phy_device *phydev, int base_addr, size_t priv_size); +int devm_of_phy_package_join(struct device *dev, struct phy_device *phydev, + size_t priv_size); int __init mdio_bus_init(void); void mdio_bus_exit(void); -- cgit v1.2.3 From 9b1d5e055508393561e26bd1720f4c2639b03b1a Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Tue, 6 Feb 2024 18:31:09 +0100 Subject: net: phy: provide whether link has changed in c37_read_status Some PHY driver might require additional regs call after genphy_c37_read_status() is called. 
Expand genphy_c37_read_status to provide a bool whether the link has changed or not to permit PHY driver to skip additional regs call if nothing has changed. Every user of genphy_c37_read_status() is updated with the new additional bool. Signed-off-by: Christian Marangi Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index cbd49418b819..2249cdb5957a 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1876,7 +1876,7 @@ int genphy_write_mmd_unsupported(struct phy_device *phdev, int devnum, /* Clause 37 */ int genphy_c37_config_aneg(struct phy_device *phydev); -int genphy_c37_read_status(struct phy_device *phydev); +int genphy_c37_read_status(struct phy_device *phydev, bool *changed); /* Clause 45 PHY */ int genphy_c45_restart_aneg(struct phy_device *phydev); -- cgit v1.2.3 From b9f55084aa0962af6247fa971f0b1e9c7aa676ef Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Wed, 17 Jan 2024 13:44:05 +0200 Subject: hwmon: put HWMON_CHANNEL_INFO() initializers in rodata HWMON_CHANNEL_INFO() is supposed to be used as initializer for arrays of const struct hwmon_channel_info *. However, without explicit const, HWMON_CHANNEL_INFO() creates mutable compound literals, and the const pointers point at the mutable data. Add const to place the data in rodata. Cc: Jean Delvare Cc: Guenter Roeck Signed-off-by: Jani Nikula Link: https://lore.kernel.org/r/20240117114405.1506775-1-jani.nikula@intel.com Signed-off-by: Guenter Roeck --- include/linux/hwmon.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index 8cd6a6b33593..c2c0da18dfa3 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -425,12 +425,12 @@ struct hwmon_channel_info { const u32 *config; }; -#define HWMON_CHANNEL_INFO(stype, ...) 
\ - (&(struct hwmon_channel_info) { \ - .type = hwmon_##stype, \ - .config = (u32 []) { \ - __VA_ARGS__, 0 \ - } \ +#define HWMON_CHANNEL_INFO(stype, ...) \ + (&(const struct hwmon_channel_info) { \ + .type = hwmon_##stype, \ + .config = (const u32 []) { \ + __VA_ARGS__, 0 \ + } \ }) /** -- cgit v1.2.3 From 35c1bfb99fef9c71f9df5c1325db99a79300bc97 Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Mon, 29 Jan 2024 17:13:24 +0100 Subject: hwmon: add fault attribute for voltage channels Sometimes a voltage channel might have a hard failure (eg: a shorted MOSFET). Hence, add a fault attribute to report such failures. Signed-off-by: Nuno Sa Link: https://lore.kernel.org/r/20240129-b4-ltc4282-support-v4-2-fe75798164cc@analog.com Signed-off-by: Guenter Roeck --- include/linux/hwmon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index c2c0da18dfa3..c7885fdce88f 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -141,6 +141,7 @@ enum hwmon_in_attributes { hwmon_in_rated_min, hwmon_in_rated_max, hwmon_in_beep, + hwmon_in_fault, }; #define HWMON_I_ENABLE BIT(hwmon_in_enable) @@ -162,6 +163,7 @@ enum hwmon_in_attributes { #define HWMON_I_RATED_MIN BIT(hwmon_in_rated_min) #define HWMON_I_RATED_MAX BIT(hwmon_in_rated_max) #define HWMON_I_BEEP BIT(hwmon_in_beep) +#define HWMON_I_FAULT BIT(hwmon_in_fault) enum hwmon_curr_attributes { hwmon_curr_enable, -- cgit v1.2.3 From 68bc61c26cacf152baf905786b5949769700f40d Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 7 Feb 2024 13:26:17 +0100 Subject: bpf: Allow compiler to inline most of bpf_local_storage_lookup() In various performance profiles of kernels with BPF programs attached, bpf_local_storage_lookup() appears as a significant portion of CPU cycles spent. 
To enable the compiler to generate more optimal code, turn bpf_local_storage_lookup() into a static inline function, where only the cache insertion code path is outlined. Notably, outlining cache insertion helps avoid bloating callers by duplicating setting up calls to raw_spin_{lock,unlock}_irqsave() (on architectures which do not inline spin_lock/unlock, such as x86), which would cause the compiler to produce worse code by deciding to outline otherwise inlinable functions. The call overhead is neutral, because we make 2 calls either way: either calling raw_spin_lock_irqsave() and raw_spin_unlock_irqsave(); or call __bpf_local_storage_insert_cache(), which calls raw_spin_lock_irqsave(), followed by a tail-call to raw_spin_unlock_irqsave() where the compiler can perform TCO and (in optimized uninstrumented builds) turns it into a plain jump. The call to __bpf_local_storage_insert_cache() can be elided entirely if cacheit_lockit is a false constant expression. Based on results from './benchs/run_bench_local_storage.sh' (21 trials, reboot between each trial; x86 defconfig + BPF, clang 16) this produces improvements in throughput and latency in the majority of cases, with an average (geomean) improvement of 8%: +---- Hashmap Control -------------------- | | + num keys: 10 | : | | +-+ hashmap (control) sequential get +----------------------+---------------------- | +- hits throughput | 14.789 M ops/s | 14.745 M ops/s ( ~ ) | +- hits latency | 67.679 ns/op | 67.879 ns/op ( ~ ) | +- important_hits throughput | 14.789 M ops/s | 14.745 M ops/s ( ~ ) | | + num keys: 1000 | : | | +-+ hashmap (control) sequential get +----------------------+---------------------- | +- hits throughput | 12.233 M ops/s | 12.170 M ops/s ( ~ ) | +- hits latency | 81.754 ns/op | 82.185 ns/op ( ~ ) | +- important_hits throughput | 12.233 M ops/s | 12.170 M ops/s ( ~ ) | | + num keys: 10000 | : | | +-+ hashmap (control) sequential get +----------------------+---------------------- | +- hits throughput | 
7.220 M ops/s | 7.204 M ops/s ( ~ ) | +- hits latency | 138.522 ns/op | 138.842 ns/op ( ~ ) | +- important_hits throughput | 7.220 M ops/s | 7.204 M ops/s ( ~ ) | | + num keys: 100000 | : | | +-+ hashmap (control) sequential get +----------------------+---------------------- | +- hits throughput | 5.061 M ops/s | 5.165 M ops/s (+2.1%) | +- hits latency | 198.483 ns/op | 194.270 ns/op (-2.1%) | +- important_hits throughput | 5.061 M ops/s | 5.165 M ops/s (+2.1%) | | + num keys: 4194304 | : | | +-+ hashmap (control) sequential get +----------------------+---------------------- | +- hits throughput | 2.864 M ops/s | 2.882 M ops/s ( ~ ) | +- hits latency | 365.220 ns/op | 361.418 ns/op (-1.0%) | +- important_hits throughput | 2.864 M ops/s | 2.882 M ops/s ( ~ ) | +---- Local Storage ---------------------- | | + num_maps: 1 | : | | +-+ local_storage cache sequential get +----------------------+---------------------- | +- hits throughput | 33.005 M ops/s | 39.068 M ops/s (+18.4%) | +- hits latency | 30.300 ns/op | 25.598 ns/op (-15.5%) | +- important_hits throughput | 33.005 M ops/s | 39.068 M ops/s (+18.4%) | : | : | | +-+ local_storage cache interleaved get +----------------------+---------------------- | +- hits throughput | 37.151 M ops/s | 44.926 M ops/s (+20.9%) | +- hits latency | 26.919 ns/op | 22.259 ns/op (-17.3%) | +- important_hits throughput | 37.151 M ops/s | 44.926 M ops/s (+20.9%) | | + num_maps: 10 | : | | +-+ local_storage cache sequential get +----------------------+---------------------- | +- hits throughput | 32.288 M ops/s | 38.099 M ops/s (+18.0%) | +- hits latency | 30.972 ns/op | 26.248 ns/op (-15.3%) | +- important_hits throughput | 3.229 M ops/s | 3.810 M ops/s (+18.0%) | : | : | | +-+ local_storage cache interleaved get +----------------------+---------------------- | +- hits throughput | 34.473 M ops/s | 41.145 M ops/s (+19.4%) | +- hits latency | 29.010 ns/op | 24.307 ns/op (-16.2%) | +- important_hits throughput | 12.312 M ops/s | 14.695 M 
ops/s (+19.4%) | | + num_maps: 16 | : | | +-+ local_storage cache sequential get +----------------------+---------------------- | +- hits throughput | 32.524 M ops/s | 38.341 M ops/s (+17.9%) | +- hits latency | 30.748 ns/op | 26.083 ns/op (-15.2%) | +- important_hits throughput | 2.033 M ops/s | 2.396 M ops/s (+17.9%) | : | : | | +-+ local_storage cache interleaved get +----------------------+---------------------- | +- hits throughput | 34.575 M ops/s | 41.338 M ops/s (+19.6%) | +- hits latency | 28.925 ns/op | 24.193 ns/op (-16.4%) | +- important_hits throughput | 11.001 M ops/s | 13.153 M ops/s (+19.6%) | | + num_maps: 17 | : | | +-+ local_storage cache sequential get +----------------------+---------------------- | +- hits throughput | 28.861 M ops/s | 32.756 M ops/s (+13.5%) | +- hits latency | 34.649 ns/op | 30.530 ns/op (-11.9%) | +- important_hits throughput | 1.700 M ops/s | 1.929 M ops/s (+13.5%) | : | : | | +-+ local_storage cache interleaved get +----------------------+---------------------- | +- hits throughput | 31.529 M ops/s | 36.110 M ops/s (+14.5%) | +- hits latency | 31.719 ns/op | 27.697 ns/op (-12.7%) | +- important_hits throughput | 9.598 M ops/s | 10.993 M ops/s (+14.5%) | | + num_maps: 24 | : | | +-+ local_storage cache sequential get +----------------------+---------------------- | +- hits throughput | 18.602 M ops/s | 19.937 M ops/s (+7.2%) | +- hits latency | 53.767 ns/op | 50.166 ns/op (-6.7%) | +- important_hits throughput | 0.776 M ops/s | 0.831 M ops/s (+7.2%) | : | : | | +-+ local_storage cache interleaved get +----------------------+---------------------- | +- hits throughput | 21.718 M ops/s | 23.332 M ops/s (+7.4%) | +- hits latency | 46.047 ns/op | 42.865 ns/op (-6.9%) | +- important_hits throughput | 6.110 M ops/s | 6.564 M ops/s (+7.4%) | | + num_maps: 32 | : | | +-+ local_storage cache sequential get +----------------------+---------------------- | +- hits throughput | 14.118 M ops/s | 14.626 M ops/s (+3.6%) | +- hits latency 
| 70.856 ns/op | 68.381 ns/op (-3.5%) | +- important_hits throughput | 0.442 M ops/s | 0.458 M ops/s (+3.6%) | : | : | | +-+ local_storage cache interleaved get +----------------------+---------------------- | +- hits throughput | 17.111 M ops/s | 17.906 M ops/s (+4.6%) | +- hits latency | 58.451 ns/op | 55.865 ns/op (-4.4%) | +- important_hits throughput | 4.776 M ops/s | 4.998 M ops/s (+4.6%) | | + num_maps: 100 | : | | +-+ local_storage cache sequential get +----------------------+---------------------- | +- hits throughput | 5.281 M ops/s | 5.528 M ops/s (+4.7%) | +- hits latency | 192.398 ns/op | 183.059 ns/op (-4.9%) | +- important_hits throughput | 0.053 M ops/s | 0.055 M ops/s (+4.9%) | : | : | | +-+ local_storage cache interleaved get +----------------------+---------------------- | +- hits throughput | 6.265 M ops/s | 6.498 M ops/s (+3.7%) | +- hits latency | 161.436 ns/op | 152.877 ns/op (-5.3%) | +- important_hits throughput | 1.636 M ops/s | 1.697 M ops/s (+3.7%) | | + num_maps: 1000 | : | | +-+ local_storage cache sequential get +----------------------+---------------------- | +- hits throughput | 0.355 M ops/s | 0.354 M ops/s ( ~ ) | +- hits latency | 2826.538 ns/op | 2827.139 ns/op ( ~ ) | +- important_hits throughput | 0.000 M ops/s | 0.000 M ops/s ( ~ ) | : | : | | +-+ local_storage cache interleaved get +----------------------+---------------------- | +- hits throughput | 0.404 M ops/s | 0.403 M ops/s ( ~ ) | +- hits latency | 2481.190 ns/op | 2487.555 ns/op ( ~ ) | +- important_hits throughput | 0.102 M ops/s | 0.101 M ops/s ( ~ ) The on_lookup test in {cgrp,task}_ls_recursion.c is removed because the bpf_local_storage_lookup is no longer traceable and adding tracepoint will make the compiler generate worse code: https://lore.kernel.org/bpf/ZcJmok64Xqv6l4ZS@elver.google.com/ Signed-off-by: Marco Elver Cc: Martin KaFai Lau Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240207122626.3508658-1-elver@google.com Signed-off-by: Martin KaFai 
Lau --- include/linux/bpf_local_storage.h | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 173ec7f43ed1..dcddb0aef7d8 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -129,10 +129,36 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, struct bpf_local_storage_cache *cache, bool bpf_ma); -struct bpf_local_storage_data * +void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, + struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *selem); +/* If cacheit_lockit is false, this lookup function is lockless */ +static inline struct bpf_local_storage_data * bpf_local_storage_lookup(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, - bool cacheit_lockit); + bool cacheit_lockit) +{ + struct bpf_local_storage_data *sdata; + struct bpf_local_storage_elem *selem; + + /* Fast path (cache hit) */ + sdata = rcu_dereference_check(local_storage->cache[smap->cache_idx], + bpf_rcu_lock_held()); + if (sdata && rcu_access_pointer(sdata->smap) == smap) + return sdata; + + /* Slow path (cache miss) */ + hlist_for_each_entry_rcu(selem, &local_storage->list, snode, + rcu_read_lock_trace_held()) + if (rcu_access_pointer(SDATA(selem)->smap) == smap) + break; + + if (!selem) + return NULL; + if (cacheit_lockit) + __bpf_local_storage_insert_cache(local_storage, smap, selem); + return SDATA(selem); +} void bpf_local_storage_destroy(struct bpf_local_storage *local_storage); -- cgit v1.2.3 From faf6efd2e5e23d3319501132d9671c8606ef21bd Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 8 Feb 2024 21:27:04 +0100 Subject: gpio: constify opaque pointer in gpio_device_find() match function The match function used in gpio_device_find() should not modify the contents of passed opaque pointer, because such modification would not be 
necessary for actual matching and it could lead to quite unreadable, spaghetti code. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Mika Westerberg Reviewed-by: Linus Walleij Reviewed-by: Andy Shevchenko [Bartosz: fix coding style in header] Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 3a37d058cfcf..9d0023f83a57 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -629,7 +629,8 @@ int devm_gpiochip_add_data_with_key(struct device *dev, struct gpio_chip *gc, struct lock_class_key *request_key); struct gpio_device *gpio_device_find(void *data, - int (*match)(struct gpio_chip *gc, void *data)); + int (*match)(struct gpio_chip *gc, + const void *data)); struct gpio_device *gpio_device_find_by_label(const char *label); struct gpio_device *gpio_device_find_by_fwnode(const struct fwnode_handle *fwnode); -- cgit v1.2.3 From 119ff04864a24470b1e531bb53e5c141aa8fefb0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Feb 2024 14:43:21 +0000 Subject: tcp: move tp->scaling_ratio to tcp_sock_read_txrx group tp->scaling_ratio is a read mostly field, used in rx and tx fast paths. Fixes: d5fed5addb2b ("tcp: reorganize tcp_sock fast path variables") Signed-off-by: Eric Dumazet Cc: Coco Li Cc: Wei Wang Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 89b290d8c8dc..168f5dca6609 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -221,6 +221,7 @@ struct tcp_sock { u32 lost_out; /* Lost packets */ u32 sacked_out; /* SACK'd packets */ u16 tcp_header_len; /* Bytes of tcp header to send */ + u8 scaling_ratio; /* see tcp_win_from_space() */ u8 chrono_type : 2, /* current chronograph type */ repair : 1, is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ @@ -352,7 +353,6 @@ struct tcp_sock { u32 compressed_ack_rcv_nxt; struct list_head tsq_node; /* anchor in tsq_tasklet.head list */ - u8 scaling_ratio; /* see tcp_win_from_space() */ /* Information of the most recently (s)acked skb */ struct tcp_rack { u64 mstamp; /* (Re)sent time of the skb */ -- cgit v1.2.3 From 666a877deab2bcf8fd11c962d69e687e18168a6f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Feb 2024 14:43:22 +0000 Subject: tcp: move tp->tcp_usec_ts to tcp_sock_read_txrx group tp->tcp_usec_ts is a read mostly field, used in rx and tx fast paths. Fixes: d5fed5addb2b ("tcp: reorganize tcp_sock fast path variables") Signed-off-by: Eric Dumazet Cc: Coco Li Cc: Wei Wang Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/tcp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 168f5dca6609..a1c47a6d69b0 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -224,6 +224,7 @@ struct tcp_sock { u8 scaling_ratio; /* see tcp_win_from_space() */ u8 chrono_type : 2, /* current chronograph type */ repair : 1, + tcp_usec_ts : 1, /* TSval values in usec */ is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ is_cwnd_limited:1;/* forward progress limited by snd_cwnd? 
*/ __cacheline_group_end(tcp_sock_read_txrx); @@ -368,8 +369,7 @@ struct tcp_sock { u8 compressed_ack; u8 dup_ack_counter:2, tlp_retrans:1, /* TLP is a retransmission */ - tcp_usec_ts:1, /* TSval values in usec */ - unused:4; + unused:5; u8 thin_lto : 1,/* Use linear timeouts for thin streams */ recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ -- cgit v1.2.3 From c353c7b7ffb7ae6ed8f3339906fe33c8be6cf344 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Feb 2024 14:43:23 +0000 Subject: net-device: move lstats in net_device_read_txrx dev->lstats is notably used from loopback ndo_start_xmit() and other virtual drivers. Per cpu stats updates are dirtying per-cpu data, but the pointer itself is read-only. Fixes: 43a71cd66b9c ("net-device: reorganize net_device fast path variables") Signed-off-by: Eric Dumazet Cc: Coco Li Cc: Simon Horman Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/netdevice.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 118c40258d07..ef7bfbb98497 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2141,6 +2141,11 @@ struct net_device { /* TXRX read-mostly hotpath */ __cacheline_group_begin(net_device_read_txrx); + union { + struct pcpu_lstats __percpu *lstats; + struct pcpu_sw_netstats __percpu *tstats; + struct pcpu_dstats __percpu *dstats; + }; unsigned int flags; unsigned short hard_header_len; netdev_features_t features; @@ -2395,11 +2400,6 @@ struct net_device { enum netdev_ml_priv_type ml_priv_type; enum netdev_stat_type pcpu_stat_type:8; - union { - struct pcpu_lstats __percpu *lstats; - struct pcpu_sw_netstats __percpu *tstats; - struct pcpu_dstats __percpu *dstats; - }; #if IS_ENABLED(CONFIG_GARP) struct garp_port __rcu *garp_port; -- cgit v1.2.3 From dd95255d44c05c9977f962bf0f2afe5e11f8ab3e Mon Sep 
17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 5 Jan 2024 13:33:32 +0100 Subject: coresight: make coresight_bustype const Now that the driver core can properly handle constant struct bus_type, move the coresight_bustype variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Cc: Suzuki K Poulose Cc: Mike Leach Cc: James Clark Cc: Leo Yan Cc: Alexander Shishkin Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/2024010531-tinfoil-avert-4a57@gregkh --- include/linux/coresight.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index a4cb7dd6ca23..e8b6e388218c 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -35,7 +35,7 @@ #define CORESIGHT_UNLOCK 0xc5acce55 -extern struct bus_type coresight_bustype; +extern const struct bus_type coresight_bustype; enum coresight_dev_type { CORESIGHT_DEV_TYPE_SINK, -- cgit v1.2.3 From a0fef3f05cf36338d471e8f35a9ced88a054d583 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 29 Jan 2024 15:40:33 +0000 Subject: coresight: Make language around "activated" sinks consistent Activated has the specific meaning of a sink that's selected for use by the user via sysfs. But comments in some code that's shared by Perf use the same word, so in those cases change them to just say "selected" instead. With selected implying either via Perf or "activated" via sysfs. coresight_get_enabled_sink() doesn't actually get an enabled sink, it only gets an activated one, so change that too. And change the activated variable name to include "sysfs" so it can't be confused as a general status. 
Signed-off-by: James Clark Link: https://lore.kernel.org/r/20240129154050.569566-3-james.clark@arm.com Signed-off-by: Suzuki K Poulose --- include/linux/coresight.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index e8b6e388218c..516ab45ff3c2 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -229,10 +229,12 @@ struct coresight_sysfs_link { * @refcnt: keep track of what is in use. * @orphan: true if the component has connections that haven't been linked. * @enable: 'true' if component is currently part of an active path. - * @activated: 'true' only if a _sink_ has been activated. A sink can be - * activated but not yet enabled. Enabling for a _sink_ - * happens when a source has been selected and a path is enabled - * from source to that sink. + * @sysfs_sink_activated: 'true' when a sink has been selected for use via sysfs + * by writing a 1 to the 'enable_sink' file. A sink can be + * activated but not yet enabled. Enabling for a _sink_ happens + * when a source has been selected and a path is enabled from + * source to that sink. A sink can also become enabled but not + * activated if it's used via Perf. * @ea: Device attribute for sink representation under PMU directory. * @def_sink: cached reference to default sink found for this device. 
* @nr_links: number of sysfs links created to other components from this @@ -252,9 +254,9 @@ struct coresight_device { struct device dev; atomic_t refcnt; bool orphan; - bool enable; /* true only if configured as part of a path */ + bool enable; /* sink specific fields */ - bool activated; /* true only if a sink is part of a path */ + bool sysfs_sink_activated; struct dev_ext_attribute *ea; struct coresight_device *def_sink; /* sysfs links between components */ -- cgit v1.2.3 From 9cae77cf23e317f31de036ced7ad2c261317dc76 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 29 Jan 2024 15:40:35 +0000 Subject: coresight: Move mode to struct coresight_device Most devices use mode, so move the mode definition out of the individual devices and up to the Coresight device. This will allow the core code to also know the mode which will be useful in a later commit. This also fixes the inconsistency of the documentation of the mode field on the individual device types. For example ETB10 had "this ETB is being used". Two devices didn't require an atomic mode type, so these usages have been converted to atomic_get() and atomic_set() only to make it compile, but the documentation of the field in struct coresight_device explains this type of usage. In the future, manipulation of the mode could be completely moved out of the individual devices and into the core code because it's almost all duplicate code, and this change is a step towards that. Signed-off-by: James Clark Link: https://lore.kernel.org/r/20240129154050.569566-5-james.clark@arm.com Signed-off-by: Suzuki K Poulose --- include/linux/coresight.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 516ab45ff3c2..01f67862ea2f 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -226,6 +226,11 @@ struct coresight_sysfs_link { * by @coresight_ops. * @access: Device i/o access abstraction for this device. 
* @dev: The device entity associated to this component. + * @mode: This tracer's mode, i.e sysFS, Perf or disabled. This is + * actually an 'enum cs_mode', but is stored in an atomic type. + * This is always accessed through local_read() and local_set(), + * but wherever it's done from within the Coresight device's lock, + * a non-atomic read would also work. * @refcnt: keep track of what is in use. * @orphan: true if the component has connections that haven't been linked. * @enable: 'true' if component is currently part of an active path. @@ -252,6 +257,7 @@ struct coresight_device { const struct coresight_ops *ops; struct csdev_access access; struct device dev; + local_t mode; atomic_t refcnt; bool orphan; bool enable; -- cgit v1.2.3 From d5e83f97eb5669bfdd894ec980083f65517df2fb Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 29 Jan 2024 15:40:36 +0000 Subject: coresight: Remove the 'enable' field. 'enable', which probably should have been 'enabled', is only ever read in the core code in relation to controlling sources, and specifically only sources in sysfs mode. Confusingly it's not labelled as such and relying on it can be a source of bugs like the one fixed by commit 078dbba3f0c9 ("coresight: Fix crash when Perf and sysfs modes are used concurrently"). Most importantly, it can only be used when the coresight_mutex is held which is only done when enabling and disabling paths in sysfs mode, and not Perf mode. So to prevent its usage spreading and leaking out to other devices, remove it. Its use is equivalent to checking if the mode is currently sysfs, as due to the coresight_mutex lock, mode == CS_MODE_SYSFS can only become true or untrue when that lock is held, and when mode == CS_MODE_SYSFS the device is both enabled and in sysfs mode. The one place it was used outside of the core code is in TPDA, but that pattern is more appropriately represented using refcounts inside the device's own spinlock. 
Signed-off-by: James Clark Link: https://lore.kernel.org/r/20240129154050.569566-6-james.clark@arm.com Signed-off-by: Suzuki K Poulose --- include/linux/coresight.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 01f67862ea2f..d1fd7070099c 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -233,7 +233,6 @@ struct coresight_sysfs_link { * a non-atomic read would also work. * @refcnt: keep track of what is in use. * @orphan: true if the component has connections that haven't been linked. - * @enable: 'true' if component is currently part of an active path. * @sysfs_sink_activated: 'true' when a sink has been selected for use via sysfs * by writing a 1 to the 'enable_sink' file. A sink can be * activated but not yet enabled. Enabling for a _sink_ happens @@ -260,7 +259,6 @@ struct coresight_device { local_t mode; atomic_t refcnt; bool orphan; - bool enable; /* sink specific fields */ bool sysfs_sink_activated; struct dev_ext_attribute *ea; -- cgit v1.2.3 From 1f5149c7751c50aba1a871143ffa6cb36af3fb49 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 29 Jan 2024 15:40:37 +0000 Subject: coresight: Move all sysfs code to sysfs file At the moment the core file contains both sysfs functionality and core functionality, while the Perf mode is in a separate file in coresight-etm-perf.c. Many of the functions have ambiguous names like coresight_enable_source() which actually only work in relation to the sysfs mode. To avoid further confusion, move everything that isn't core functionality into the sysfs file and append _sysfs to the ambiguous functions. 
Signed-off-by: James Clark Link: https://lore.kernel.org/r/20240129154050.569566-7-james.clark@arm.com Signed-off-by: Suzuki K Poulose --- include/linux/coresight.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index d1fd7070099c..365b28022c5b 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -578,8 +578,8 @@ static inline bool coresight_is_percpu_sink(struct coresight_device *csdev) extern struct coresight_device * coresight_register(struct coresight_desc *desc); extern void coresight_unregister(struct coresight_device *csdev); -extern int coresight_enable(struct coresight_device *csdev); -extern void coresight_disable(struct coresight_device *csdev); +extern int coresight_enable_sysfs(struct coresight_device *csdev); +extern void coresight_disable_sysfs(struct coresight_device *csdev); extern int coresight_timeout(struct csdev_access *csa, u32 offset, int position, int value); @@ -609,8 +609,8 @@ static inline struct coresight_device * coresight_register(struct coresight_desc *desc) { return NULL; } static inline void coresight_unregister(struct coresight_device *csdev) {} static inline int -coresight_enable(struct coresight_device *csdev) { return -ENOSYS; } -static inline void coresight_disable(struct coresight_device *csdev) {} +coresight_enable_sysfs(struct coresight_device *csdev) { return -ENOSYS; } +static inline void coresight_disable_sysfs(struct coresight_device *csdev) {} static inline int coresight_timeout(struct csdev_access *csa, u32 offset, int position, int value) -- cgit v1.2.3 From 4545b38ef004a586295750ea49a505b6396a7c90 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 29 Jan 2024 15:40:38 +0000 Subject: coresight: Remove atomic type from refcnt Refcnt is only ever accessed from either inside the coresight_mutex, or the device's spinlock, making the atomic type and atomic_dec_return() calls confusing and 
unnecessary. The only point of synchronisation outside of these two types of locks is already done with a compare and swap on 'mode', which a comment has been added for. There was one instance of refcnt being used outside of a lock in TPIU, but that can easily be fixed by making it the same as all the other devices and adding a spinlock. Potentially in the future all the refcounting and locking can be moved up into the core code, and all the mostly duplicate code from the individual devices can be removed. Signed-off-by: James Clark Link: https://lore.kernel.org/r/20240129154050.569566-8-james.clark@arm.com Signed-off-by: Suzuki K Poulose --- include/linux/coresight.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 365b28022c5b..74bcec526aa9 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -230,8 +230,15 @@ struct coresight_sysfs_link { * actually an 'enum cs_mode', but is stored in an atomic type. * This is always accessed through local_read() and local_set(), * but wherever it's done from within the Coresight device's lock, - * a non-atomic read would also work. - * @refcnt: keep track of what is in use. + * a non-atomic read would also work. This is the main point of + * synchronisation between code happening inside the sysfs mode's + * coresight_mutex and outside when running in Perf mode. A compare + * and exchange swap is done to atomically claim one mode or the + * other. + * @refcnt: keep track of what is in use. Only access this outside of the + * device's spinlock when the coresight_mutex held and mode == + * CS_MODE_SYSFS. Otherwise it must be accessed from inside the + * spinlock. * @orphan: true if the component has connections that haven't been linked. * @sysfs_sink_activated: 'true' when a sink has been selected for use via sysfs * by writing a 1 to the 'enable_sink' file. 
A sink can be @@ -257,7 +264,7 @@ struct coresight_device { struct csdev_access access; struct device dev; local_t mode; - atomic_t refcnt; + int refcnt; bool orphan; /* sink specific fields */ bool sysfs_sink_activated; -- cgit v1.2.3 From 053ad9ad1d13f253605d7644de3aa20d958569ef Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 29 Jan 2024 15:40:39 +0000 Subject: coresight: Remove unused stubs These are a bit annoying to keep up to date when the function signatures change. But if CONFIG_CORESIGHT isn't enabled, then they're not used anyway so just delete them. Signed-off-by: James Clark Link: https://lore.kernel.org/r/20240129154050.569566-9-james.clark@arm.com Signed-off-by: Suzuki K Poulose --- include/linux/coresight.h | 79 ----------------------------------------------- 1 file changed, 79 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 74bcec526aa9..ecf4b8aecca8 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -391,8 +391,6 @@ struct coresight_ops { const struct coresight_ops_helper *helper_ops; }; -#if IS_ENABLED(CONFIG_CORESIGHT) - static inline u32 csdev_access_relaxed_read32(struct csdev_access *csa, u32 offset) { @@ -611,83 +609,6 @@ void coresight_relaxed_write64(struct coresight_device *csdev, u64 val, u32 offset); void coresight_write64(struct coresight_device *csdev, u64 val, u32 offset); -#else -static inline struct coresight_device * -coresight_register(struct coresight_desc *desc) { return NULL; } -static inline void coresight_unregister(struct coresight_device *csdev) {} -static inline int -coresight_enable_sysfs(struct coresight_device *csdev) { return -ENOSYS; } -static inline void coresight_disable_sysfs(struct coresight_device *csdev) {} - -static inline int coresight_timeout(struct csdev_access *csa, u32 offset, - int position, int value) -{ - return 1; -} - -static inline int coresight_claim_device_unlocked(struct coresight_device *csdev) -{ - 
return -EINVAL; -} - -static inline int coresight_claim_device(struct coresight_device *csdev) -{ - return -EINVAL; -} - -static inline void coresight_disclaim_device(struct coresight_device *csdev) {} -static inline void coresight_disclaim_device_unlocked(struct coresight_device *csdev) {} - -static inline bool coresight_loses_context_with_cpu(struct device *dev) -{ - return false; -} - -static inline u32 coresight_relaxed_read32(struct coresight_device *csdev, u32 offset) -{ - WARN_ON_ONCE(1); - return 0; -} - -static inline u32 coresight_read32(struct coresight_device *csdev, u32 offset) -{ - WARN_ON_ONCE(1); - return 0; -} - -static inline void coresight_write32(struct coresight_device *csdev, u32 val, u32 offset) -{ -} - -static inline void coresight_relaxed_write32(struct coresight_device *csdev, - u32 val, u32 offset) -{ -} - -static inline u64 coresight_relaxed_read64(struct coresight_device *csdev, - u32 offset) -{ - WARN_ON_ONCE(1); - return 0; -} - -static inline u64 coresight_read64(struct coresight_device *csdev, u32 offset) -{ - WARN_ON_ONCE(1); - return 0; -} - -static inline void coresight_relaxed_write64(struct coresight_device *csdev, - u64 val, u32 offset) -{ -} - -static inline void coresight_write64(struct coresight_device *csdev, u64 val, u32 offset) -{ -} - -#endif /* IS_ENABLED(CONFIG_CORESIGHT) */ - extern int coresight_get_cpu(struct device *dev); struct coresight_platform_data *coresight_get_platform_data(struct device *dev); -- cgit v1.2.3 From d724f65218b994da234081df5dfe417c23802a65 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 29 Jan 2024 15:40:41 +0000 Subject: coresight: Add helper for atomically taking the device Now that mode is in struct coresight_device, this pattern can be wrapped in a helper. 
Signed-off-by: James Clark Link: https://lore.kernel.org/r/20240129154050.569566-11-james.clark@arm.com Signed-off-by: Suzuki K Poulose --- include/linux/coresight.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index ecf4b8aecca8..414bcbbdaf62 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -580,6 +580,17 @@ static inline bool coresight_is_percpu_sink(struct coresight_device *csdev) (csdev->subtype.sink_subtype == CORESIGHT_DEV_SUBTYPE_SINK_PERCPU_SYSMEM); } +/* + * Atomically try to take the device and set a new mode. Returns true on + * success, false if the device is already taken by someone else. + */ +static inline bool coresight_take_mode(struct coresight_device *csdev, + enum cs_mode new_mode) +{ + return local_cmpxchg(&csdev->mode, CS_MODE_DISABLED, new_mode) == + CS_MODE_DISABLED; +} + extern struct coresight_device * coresight_register(struct coresight_desc *desc); extern void coresight_unregister(struct coresight_device *csdev); -- cgit v1.2.3 From c95c2733e5feb1f6848923f166849b2d1c7bf682 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 29 Jan 2024 15:40:42 +0000 Subject: coresight: Add a helper for getting csdev->mode Now that mode is in struct coresight_device accesses can be wrapped. 
Signed-off-by: James Clark Link: https://lore.kernel.org/r/20240129154050.569566-12-james.clark@arm.com Signed-off-by: Suzuki K Poulose --- include/linux/coresight.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 414bcbbdaf62..a49e4e20e899 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -591,6 +591,11 @@ static inline bool coresight_take_mode(struct coresight_device *csdev, CS_MODE_DISABLED; } +static inline enum cs_mode coresight_get_mode(struct coresight_device *csdev) +{ + return local_read(&csdev->mode); +} + extern struct coresight_device * coresight_register(struct coresight_desc *desc); extern void coresight_unregister(struct coresight_device *csdev); -- cgit v1.2.3 From bcaabb95f0c9883fb8e1112bd13eaba9cfd62c15 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 29 Jan 2024 15:40:43 +0000 Subject: coresight: Add helper for setting csdev->mode Now that mode is in struct coresight_device, sets can be wrapped. This also allows us to add a sanity check that there have been no concurrent modifications of mode. Currently all usages of local_set() were inside the device's spin locks so this new warning shouldn't be triggered. coresight_take_mode() could maybe have been used in place of adding the warning, but there may be use cases which set the mode to the same mode which are valid but would fail in coresight_take_mode() because it requires the device to only be in the disabled state. 
Signed-off-by: James Clark Link: https://lore.kernel.org/r/20240129154050.569566-13-james.clark@arm.com Signed-off-by: Suzuki K Poulose --- include/linux/coresight.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index a49e4e20e899..5f288d475490 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -596,6 +596,22 @@ static inline enum cs_mode coresight_get_mode(struct coresight_device *csdev) return local_read(&csdev->mode); } +static inline void coresight_set_mode(struct coresight_device *csdev, + enum cs_mode new_mode) +{ + enum cs_mode current_mode = coresight_get_mode(csdev); + + /* + * Changing to a new mode must be done from an already disabled state + * unless it's synchronized with coresight_take_mode(). Otherwise the + * device is already in use and signifies a locking issue. + */ + WARN(new_mode != CS_MODE_DISABLED && current_mode != CS_MODE_DISABLED && + current_mode != new_mode, "Device already in use\n"); + + local_set(&csdev->mode, new_mode); +} + extern struct coresight_device * coresight_register(struct coresight_desc *desc); extern void coresight_unregister(struct coresight_device *csdev); -- cgit v1.2.3 From ae8c511757304e0c393661b5ed2ad7073e2a351d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Feb 2024 21:56:19 -0500 Subject: fs: add FS_IOC_GETFSSYSFSPATH Add a new ioctl for getting the sysfs name of a filesystem - the path under /sys/fs. This is going to let us standardize exporting data from sysfs across filesystems, e.g. time stats. The returned path will always be of the form "$FSTYP/$SYSFS_IDENTIFIER", where the sysfs identifier may be a UUID (for bcachefs) or a device name (xfs). Cc: Christian Brauner Cc: Jan Kara Cc: Dave Chinner Cc: Darrick J. 
Wong Cc: Theodore Ts'o Cc: Josef Bacik Signed-off-by: Kent Overstreet Link: https://lore.kernel.org/r/20240207025624.1019754-6-kent.overstreet@linux.dev Signed-off-by: Christian Brauner --- include/linux/fs.h | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index acdc56987cb1..c6d9e1b7032c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1255,10 +1255,23 @@ struct super_block { struct fsnotify_mark_connector __rcu *s_fsnotify_marks; #endif + /* + * q: why are s_id and s_sysfs_name not the same? both are human + * readable strings that identify the filesystem + * a: s_id is allowed to change at runtime; it's used in log messages, + * and we want to when a device starts out as single device (s_id is dev + * name) but then a device is hot added and we have to switch to + * identifying it by UUID + * but s_sysfs_name is a handle for programmatic access, and can't + * change at runtime + */ char s_id[32]; /* Informational name */ uuid_t s_uuid; /* UUID */ u8 s_uuid_len; /* Default 16, possibly smaller for weird filesystems */ + /* if set, fs shows up under sysfs at /sys/fs/$FSTYP/s_sysfs_name */ + char s_sysfs_name[UUID_STRING_LEN + 1]; + unsigned int s_max_links; /* @@ -2541,6 +2554,36 @@ static inline void super_set_uuid(struct super_block *sb, const u8 *uuid, unsign memcpy(&sb->s_uuid, uuid, len); } +/* set sb sysfs name based on sb->s_bdev */ +static inline void super_set_sysfs_name_bdev(struct super_block *sb) +{ + snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pg", sb->s_bdev); +} + +/* set sb sysfs name based on sb->s_uuid */ +static inline void super_set_sysfs_name_uuid(struct super_block *sb) +{ + WARN_ON(sb->s_uuid_len != sizeof(sb->s_uuid)); + snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pU", sb->s_uuid.b); +} + +/* set sb sysfs name based on sb->s_id */ +static inline void super_set_sysfs_name_id(struct super_block 
*sb) +{ + strscpy(sb->s_sysfs_name, sb->s_id, sizeof(sb->s_sysfs_name)); +} + +/* try to use something standard before you use this */ +__printf(2, 3) +static inline void super_set_sysfs_name_generic(struct super_block *sb, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + vsnprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), fmt, args); + va_end(args); +} + extern int current_umask(void); extern void ihold(struct inode * inode); -- cgit v1.2.3 From 582a3bf999df662c0e0fa4717ce7559f16d7b93b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 12 Feb 2024 07:24:05 -0500 Subject: filelock: always define for_each_file_lock() ...and eliminate the stub version when CONFIG_FILE_LOCKING is disabled. This silences the following warning that crept in recently: fs/ceph/locks.c: In function 'ceph_count_locks': fs/ceph/locks.c:380:27: error: unused variable 'lock' [-Werror=unused-variable] 380 | struct file_lock *lock; Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202402062210.3YyBVGF1-lkp@intel.com/ Fixes: 75cabec0111b ("filelock: add some new helper functions") Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20240212-flsplit3-v1-1-019f0ad6bf69@kernel.org Signed-off-by: Christian Brauner --- include/linux/filelock.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filelock.h b/include/linux/filelock.h index aabd4bdf7eba..daee999d05f3 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -180,9 +180,6 @@ static inline void locks_wake_up(struct file_lock *fl) wake_up(&fl->c.flc_wait); } -/* for walking lists of file_locks linked by fl_list */ -#define for_each_file_lock(_fl, _head) list_for_each_entry(_fl, _head, c.flc_list) - /* fs/locks.c */ void locks_free_lock_context(struct inode *inode); void locks_free_lock(struct file_lock *fl); @@ -283,8 +280,6 @@ static inline void locks_wake_up(struct file_lock *fl) { } -#define 
for_each_file_lock(_fl, _head) while(false) - static inline void locks_free_lock_context(struct inode *inode) { @@ -414,6 +409,9 @@ locks_inode_context(const struct inode *inode) #endif /* !CONFIG_FILE_LOCKING */ +/* for walking lists of file_locks linked by fl_list */ +#define for_each_file_lock(_fl, _head) list_for_each_entry(_fl, _head, c.flc_list) + static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl) { return locks_lock_inode_wait(file_inode(filp), fl); -- cgit v1.2.3 From 71f4ecdbb42addf82b01b734b122a02707fed521 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Sun, 28 Jan 2024 23:52:20 -0800 Subject: block: remove gfp_flags from blkdev_zone_mgmt Now that all callers pass in GFP_KERNEL to blkdev_zone_mgmt() and use memalloc_no{io,fs}_{save,restore}() to define the allocation scope, we can drop the gfp_mask parameter from blkdev_zone_mgmt() as well as blkdev_zone_reset_all() and blkdev_zone_reset_all_emulated(). Signed-off-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20240128-zonefs_nofs-v3-5-ae3b7c8def61@wdc.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d7cac3de65b3..fac580976e3a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -325,7 +325,7 @@ void disk_set_zoned(struct gendisk *disk); int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, - sector_t sectors, sector_t nr_sectors, gfp_t gfp_mask); + sector_t sectors, sector_t nr_sectors); int blk_revalidate_disk_zones(struct gendisk *disk, void (*update_driver_data)(struct gendisk *disk)); -- cgit v1.2.3 From 60d21aac52e26531affdadb7543fe5b93f58b450 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Thu, 1 
Feb 2024 18:31:25 +0530 Subject: block: support PI at non-zero offset within metadata Block layer integrity processing assumes that protection information (PI) is placed in the first bytes of each metadata block. Remove this limitation and include the metadata before the PI in the calculation of the guard tag. Signed-off-by: Kanchan Joshi Signed-off-by: Chinmay Gameti Reviewed-by: Sagi Grimberg Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240201130126.211402-3-joshi.k@samsung.com Signed-off-by: Jens Axboe --- include/linux/blk-integrity.h | 1 + include/linux/blkdev.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index 378b2459efe2..e253e7bd0d17 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -20,6 +20,7 @@ struct blk_integrity_iter { unsigned int data_size; unsigned short interval; unsigned char tuple_size; + unsigned char pi_offset; const char *disk_name; }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fac580976e3a..0058783a4c43 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -108,6 +108,7 @@ struct blk_integrity { const struct blk_integrity_profile *profile; unsigned char flags; unsigned char tuple_size; + unsigned char pi_offset; unsigned char interval_exp; unsigned char tag_size; }; -- cgit v1.2.3 From c0ef3df8dbaef51ee4cfd58a471adf2eaee6f6b3 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Tue, 30 Jan 2024 13:28:05 +0200 Subject: PM: runtime: Simplify pm_runtime_get_if_active() usage There are two ways to opportunistically increment a device's runtime PM usage count, calling either pm_runtime_get_if_active() or pm_runtime_get_if_in_use(). The former has an argument to tell whether to ignore the usage count or not, and the latter simply calls the former with ign_usage_count set to false. 
The other users that want to ignore the usage_count will have to explicitly set that argument to true which is a bit cumbersome. To make this function more practical to use, remove the ign_usage_count argument from the function. The main implementation is in a static function called pm_runtime_get_conditional() and implementations of pm_runtime_get_if_active() and pm_runtime_get_if_in_use() are moved to runtime.c. Signed-off-by: Sakari Ailus Reviewed-by: Alex Elder Reviewed-by: Laurent Pinchart Acked-by: Takashi Iwai # sound/ Reviewed-by: Jacek Lawrynowicz # drivers/accel/ivpu/ Acked-by: Rodrigo Vivi # drivers/gpu/drm/i915/ Reviewed-by: Rodrigo Vivi Acked-by: Bjorn Helgaas # drivers/pci/ Signed-off-by: Rafael J. Wysocki --- include/linux/pm_runtime.h | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 7c9b35448563..436baa167498 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -72,7 +72,8 @@ extern int pm_runtime_force_resume(struct device *dev); extern int __pm_runtime_idle(struct device *dev, int rpmflags); extern int __pm_runtime_suspend(struct device *dev, int rpmflags); extern int __pm_runtime_resume(struct device *dev, int rpmflags); -extern int pm_runtime_get_if_active(struct device *dev, bool ign_usage_count); +extern int pm_runtime_get_if_active(struct device *dev); +extern int pm_runtime_get_if_in_use(struct device *dev); extern int pm_schedule_suspend(struct device *dev, unsigned int delay); extern int __pm_runtime_set_status(struct device *dev, unsigned int status); extern int pm_runtime_barrier(struct device *dev); @@ -94,18 +95,6 @@ extern void pm_runtime_release_supplier(struct device_link *link); extern int devm_pm_runtime_enable(struct device *dev); -/** - * pm_runtime_get_if_in_use - Conditionally bump up runtime PM usage counter. - * @dev: Target device. 
- * - * Increment the runtime PM usage counter of @dev if its runtime PM status is - * %RPM_ACTIVE and its runtime PM usage counter is greater than 0. - */ -static inline int pm_runtime_get_if_in_use(struct device *dev) -{ - return pm_runtime_get_if_active(dev, false); -} - /** * pm_suspend_ignore_children - Set runtime PM behavior regarding children. * @dev: Target device. @@ -275,8 +264,7 @@ static inline int pm_runtime_get_if_in_use(struct device *dev) { return -EINVAL; } -static inline int pm_runtime_get_if_active(struct device *dev, - bool ign_usage_count) +static inline int pm_runtime_get_if_active(struct device *dev) { return -EINVAL; } -- cgit v1.2.3 From b7d46644e554ed017dfabc0841acf418d0584bc9 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Tue, 30 Jan 2024 13:28:32 +0200 Subject: PM: runtime: Add pm_runtime_put_autosuspend() replacement Add __pm_runtime_put_autosuspend() that replaces pm_runtime_put_autosuspend() for new users. The intent is to later re-purpose pm_runtime_put_autosuspend() to also mark the device's last busy stamp---which is what the vast majority of users actually need. This is also described in pm_runtime_put_autosuspend() documentation. Signed-off-by: Sakari Ailus Reviewed-by: Laurent Pinchart Signed-off-by: Rafael J. Wysocki --- include/linux/pm_runtime.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 436baa167498..d39dc863f612 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -448,6 +448,18 @@ static inline int pm_runtime_put(struct device *dev) return __pm_runtime_idle(dev, RPM_GET_PUT | RPM_ASYNC); } +/** + * __pm_runtime_put_autosuspend - Drop device usage counter and queue autosuspend if 0. + * @dev: Target device. + * + * Decrement the runtime PM usage counter of @dev and if it turns out to be + * equal to 0, queue up a work item for @dev like in pm_request_autosuspend(). 
+ */ +static inline int __pm_runtime_put_autosuspend(struct device *dev) +{ + return __pm_runtime_suspend(dev, RPM_GET_PUT | RPM_ASYNC | RPM_AUTO); +} + /** * pm_runtime_put_autosuspend - Drop device usage counter and queue autosuspend if 0. * @dev: Target device. -- cgit v1.2.3 From 7251b9e8a007ddd834aa81f8c7ea338884629fec Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 6 Feb 2024 09:54:09 +0800 Subject: thermal/intel: Fix intel_tcc_get_temp() to support negative CPU temperature CPU temperature can be negative in some cases. Thus the negative CPU temperature should not be considered as a failure. Fix intel_tcc_get_temp() and its users to support negative CPU temperature. Fixes: a3c1f066e1c5 ("thermal/intel: Introduce Intel TCC library") Signed-off-by: Zhang Rui Reviewed-by: Stanislaw Gruszka Cc: 6.3+ # 6.3+ Signed-off-by: Rafael J. Wysocki --- include/linux/intel_tcc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/intel_tcc.h b/include/linux/intel_tcc.h index f422612c28d6..8ff8eabb4a98 100644 --- a/include/linux/intel_tcc.h +++ b/include/linux/intel_tcc.h @@ -13,6 +13,6 @@ int intel_tcc_get_tjmax(int cpu); int intel_tcc_get_offset(int cpu); int intel_tcc_set_offset(int cpu, int offset); -int intel_tcc_get_temp(int cpu, bool pkg); +int intel_tcc_get_temp(int cpu, int *temp, bool pkg); #endif /* __INTEL_TCC_H__ */ -- cgit v1.2.3 From 11ba1728be3edb6928791f4c622f154ebe228ae6 Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Fri, 2 Feb 2024 12:30:26 +0000 Subject: ptrace: Introduce exception_ip arch hook On architectures with delay slot, architecture level instruction pointer (or program counter) in pt_regs may differ from where exception was triggered. Introduce exception_ip hook to invoke architecture code and determine actual instruction pointer to the exception. 
Link: https://lore.kernel.org/lkml/00d1b813-c55f-4365-8d81-d70258e10b16@app.fastmail.com/ Signed-off-by: Jiaxun Yang Signed-off-by: Thomas Bogendoerfer --- include/linux/ptrace.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index eaaef3ffec22..90507d4afcd6 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -393,6 +393,10 @@ static inline void user_single_step_report(struct pt_regs *regs) #define current_user_stack_pointer() user_stack_pointer(current_pt_regs()) #endif +#ifndef exception_ip +#define exception_ip(x) instruction_pointer(x) +#endif + extern int task_current_syscall(struct task_struct *target, struct syscall_info *info); extern void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact); -- cgit v1.2.3 From 6ac86372102b477083db99a9af8246fb916271b5 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 25 Jan 2024 09:15:59 +0100 Subject: gpiolib: add gpiod_to_gpio_device() stub for !GPIOLIB Add empty stub of gpiod_to_gpio_device() when GPIOLIB is not enabled. 
Cc: Fixes: 370232d096e3 ("gpiolib: provide gpiod_to_gpio_device()") Signed-off-by: Krzysztof Kozlowski Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 9a5c6c76e653..012797e7106d 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -819,6 +819,12 @@ static inline struct gpio_chip *gpiod_to_chip(const struct gpio_desc *desc) return ERR_PTR(-ENODEV); } +static inline struct gpio_device *gpiod_to_gpio_device(struct gpio_desc *desc) +{ + WARN_ON(1); + return ERR_PTR(-ENODEV); +} + static inline int gpiochip_lock_as_irq(struct gpio_chip *gc, unsigned int offset) { -- cgit v1.2.3 From ebe0c15b135b1e4092c25b95d89e9a5899467499 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 25 Jan 2024 09:16:00 +0100 Subject: gpiolib: add gpio_device_get_base() stub for !GPIOLIB Add empty stub of gpio_device_get_base() when GPIOLIB is not enabled. 
Cc: Fixes: 8c85a102fc4e ("gpiolib: provide gpio_device_get_base()") Signed-off-by: Krzysztof Kozlowski Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 012797e7106d..c1df7698edb0 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -825,6 +825,12 @@ static inline struct gpio_device *gpiod_to_gpio_device(struct gpio_desc *desc) return ERR_PTR(-ENODEV); } +static inline int gpio_device_get_base(struct gpio_device *gdev) +{ + WARN_ON(1); + return -ENODEV; +} + static inline int gpiochip_lock_as_irq(struct gpio_chip *gc, unsigned int offset) { -- cgit v1.2.3 From 2df8aa3cad407044f2febdbbdf220c6dae839c79 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 25 Jan 2024 09:16:01 +0100 Subject: gpiolib: add gpio_device_get_label() stub for !GPIOLIB Add empty stub of gpio_device_get_label() when GPIOLIB is not enabled. 
Cc: Fixes: d1f7728259ef ("gpiolib: provide gpio_device_get_label()") Suggested-by: kernel test robot Signed-off-by: Krzysztof Kozlowski Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index c1df7698edb0..7f75c9a51874 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -831,6 +831,12 @@ static inline int gpio_device_get_base(struct gpio_device *gdev) return -ENODEV; } +static inline const char *gpio_device_get_label(struct gpio_device *gdev) +{ + WARN_ON(1); + return NULL; +} + static inline int gpiochip_lock_as_irq(struct gpio_chip *gc, unsigned int offset) { -- cgit v1.2.3 From ed94a8f8ca75ea0f607c919edf2ed5a5e707ef44 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Thu, 1 Feb 2024 12:53:16 +0100 Subject: HID: protect hid_device::bpf by CONFIG_HID_BPF And not by CONFIG_BPF. BPF can be selected while HID_BPF does not have to. It actually cannot be on some platforms due to Kconfig dependences. This saves quite some bytes on those setups. Found by https://github.com/jirislaby/clang-struct. 
Signed-off-by: Jiri Slaby (SUSE) Cc: Jiri Kosina Cc: Benjamin Tissoires Signed-off-by: Jiri Kosina --- include/linux/hid.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 7c26db874ff0..b12cb1c8e682 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -683,9 +683,9 @@ struct hid_device { /* device report descriptor */ unsigned int id; /* system unique id */ -#ifdef CONFIG_BPF +#ifdef CONFIG_HID_BPF struct hid_bpf bpf; /* hid-bpf data */ -#endif /* CONFIG_BPF */ +#endif /* CONFIG_HID_BPF */ }; void hiddev_free(struct kref *ref); -- cgit v1.2.3 From 8c4955c069ea3b77dc63b55d13afa9341e894849 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Feb 2024 08:34:11 +0100 Subject: block: move max_{open,active}_zones to struct queue_limits The maximum number of open and active zones is a limit on the queue and should be placed there so that we can include it in the upcoming queue limits batch update API. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240213073425.1621680-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0058783a4c43..251a11d2d2ae 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -190,8 +190,6 @@ struct gendisk { * blk_mq_unfreeze_queue().
*/ unsigned int nr_zones; - unsigned int max_open_zones; - unsigned int max_active_zones; unsigned long *conv_zones_bitmap; unsigned long *seq_zones_wlock; #endif /* CONFIG_BLK_DEV_ZONED */ @@ -308,6 +306,8 @@ struct queue_limits { unsigned char discard_misaligned; unsigned char raid_partial_stripes_expensive; bool zoned; + unsigned int max_open_zones; + unsigned int max_active_zones; /* * Drivers that set dma_alignment to less than 511 must be prepared to @@ -640,23 +640,23 @@ static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector) static inline void disk_set_max_open_zones(struct gendisk *disk, unsigned int max_open_zones) { - disk->max_open_zones = max_open_zones; + disk->queue->limits.max_open_zones = max_open_zones; } static inline void disk_set_max_active_zones(struct gendisk *disk, unsigned int max_active_zones) { - disk->max_active_zones = max_active_zones; + disk->queue->limits.max_active_zones = max_active_zones; } static inline unsigned int bdev_max_open_zones(struct block_device *bdev) { - return bdev->bd_disk->max_open_zones; + return bdev->bd_disk->queue->limits.max_open_zones; } static inline unsigned int bdev_max_active_zones(struct block_device *bdev) { - return bdev->bd_disk->max_active_zones; + return bdev->bd_disk->queue->limits.max_active_zones; } #else /* CONFIG_BLK_DEV_ZONED */ -- cgit v1.2.3 From d690cb8ae14bd377d422b7905b6959c7e7a45b95 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Feb 2024 08:34:14 +0100 Subject: block: add an API to atomically update queue limits Add a new queue_limits_{start,commit}_update pair of functions that allows taking an atomic snapshot of queue limits, update it, and commit it if it passes validity checking. Also use the low-level validation helper to implement blk_set_default_limits instead of duplicating the initialization. 
Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240213073425.1621680-5-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 251a11d2d2ae..d41d7fe93457 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -474,6 +474,7 @@ struct request_queue { struct mutex sysfs_lock; struct mutex sysfs_dir_lock; + struct mutex limits_lock; /* * for reusing dead hctx instance in case of updating @@ -862,6 +863,28 @@ static inline unsigned int blk_chunk_sectors_left(sector_t offset, return chunk_sectors - (offset & (chunk_sectors - 1)); } +/** + * queue_limits_start_update - start an atomic update of queue limits + * @q: queue to update + * + * This functions starts an atomic update of the queue limits. It takes a lock + * to prevent other updates and returns a snapshot of the current limits that + * the caller can modify. The caller must call queue_limits_commit_update() + * to finish the update. + * + * Context: process context. The caller must have frozen the queue or ensured + * that there is outstanding I/O by other means. + */ +static inline struct queue_limits +queue_limits_start_update(struct request_queue *q) + __acquires(q->limits_lock) +{ + mutex_lock(&q->limits_lock); + return q->limits; +} +int queue_limits_commit_update(struct request_queue *q, + struct queue_limits *lim); + /* * Access functions for manipulating queue properties */ -- cgit v1.2.3 From 4f563a64732dabb2677c7d1232a8f714a18b41b3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Feb 2024 08:34:16 +0100 Subject: block: add a max_user_discard_sectors queue limit Add a new max_user_discard_sectors limit that mirrors max_user_sectors and stores the value that the user manually set. 
This now allows updates of the max_hw_discard_sectors to not worry about the user limit. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240213073425.1621680-7-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d41d7fe93457..45746ba73670 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -291,6 +291,7 @@ struct queue_limits { unsigned int io_opt; unsigned int max_discard_sectors; unsigned int max_hw_discard_sectors; + unsigned int max_user_discard_sectors; unsigned int max_secure_erase_sectors; unsigned int max_write_zeroes_sectors; unsigned int max_zone_append_sectors; -- cgit v1.2.3 From 9ac4dd8c47d533eb420af6a679e66ec74771125c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Feb 2024 08:34:19 +0100 Subject: block: pass a queue_limits argument to blk_mq_init_queue Pass a queue_limits to blk_mq_init_queue and apply it if non-NULL. This will allow allocating queues with valid queue limits instead of setting the values one at a time later. Also rename the function to blk_mq_alloc_queue as that is a much better name for a function that allocates a queue and always pass the queuedata argument instead of having a separate version for the extra argument. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: John Garry Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240213073425.1621680-10-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 7a8150a5f051..7d42c359e2ab 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -692,7 +692,8 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, }) struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, struct lock_class_key *lkclass); -struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); +struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, + struct queue_limits *lim, void *queuedata); int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q); void blk_mq_destroy_queue(struct request_queue *); -- cgit v1.2.3 From 27e32cd23fed1ab88098897897dcb9ec2bdba4de Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Feb 2024 08:34:20 +0100 Subject: block: pass a queue_limits argument to blk_mq_alloc_disk Pass a queue_limits to blk_mq_alloc_disk and apply it if non-NULL. This will allow allocating queues with valid queue limits instead of setting the values one at a time later. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: John Garry Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240213073425.1621680-11-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 7d42c359e2ab..390d35fa0032 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -682,13 +682,14 @@ enum { #define BLK_MQ_NO_HCTX_IDX (-1U) -struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, +struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, + struct queue_limits *lim, void *queuedata, struct lock_class_key *lkclass); -#define blk_mq_alloc_disk(set, queuedata) \ +#define blk_mq_alloc_disk(set, lim, queuedata) \ ({ \ static struct lock_class_key __key; \ \ - __blk_mq_alloc_disk(set, queuedata, &__key); \ + __blk_mq_alloc_disk(set, lim, queuedata, &__key); \ }) struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, struct lock_class_key *lkclass); -- cgit v1.2.3 From 1aa09b9379a7a644cd2f75ae0bac82b8783df600 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 31 Jan 2024 19:37:09 +0800 Subject: powercap: intel_rapl: Fix locking in TPMI RAPL The RAPL framework uses CPU hotplug locking to protect the rapl_packages list and rp->lead_cpu to guarantee that 1. the RAPL package device is not unprobed and freed 2. the cached rp->lead_cpu is always valid for operations like powercap sysfs accesses. Current RAPL APIs assume being called from CPU hotplug callbacks which hold the CPU hotplug lock, but TPMI RAPL driver invokes the APIs in the driver's .probe() function without acquiring the CPU hotplug lock. Fix the problem by providing both locked and lockless versions of RAPL APIs. Fixes: 9eef7f9da928 ("powercap: intel_rapl: Introduce RAPL TPMI interface driver") Signed-off-by: Zhang Rui Cc: 6.5+ # 6.5+ Signed-off-by: Rafael J. 
Wysocki --- include/linux/intel_rapl.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index 33f21bd85dbf..f3196f82fd8a 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -178,6 +178,12 @@ struct rapl_package { struct rapl_if_priv *priv; }; +struct rapl_package *rapl_find_package_domain_cpuslocked(int id, struct rapl_if_priv *priv, + bool id_is_cpu); +struct rapl_package *rapl_add_package_cpuslocked(int id, struct rapl_if_priv *priv, + bool id_is_cpu); +void rapl_remove_package_cpuslocked(struct rapl_package *rp); + struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv, bool id_is_cpu); struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id_is_cpu); void rapl_remove_package(struct rapl_package *rp); -- cgit v1.2.3 From 29f6975332479f92233594901c649ff4d71f8cb6 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 5 Feb 2024 11:10:25 -0800 Subject: nvme: implement support for relaxed effects NVM Express TP4167 provides a way for controllers to report a relaxed execution constraint. Specifically, it notifies of exclusivity for IO vs. admin commands instead of grouping these together. If set, then we don't need to freeze IO in order to execute that admin command. The freezing disrupts IO processes, so it's nice to avoid that if the controller tells us it's not necessary.
Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index bc605ec4a3fd..3ef4053ea950 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -646,6 +646,7 @@ enum { NVME_CMD_EFFECTS_NCC = 1 << 2, NVME_CMD_EFFECTS_NIC = 1 << 3, NVME_CMD_EFFECTS_CCC = 1 << 4, + NVME_CMD_EFFECTS_CSER_MASK = GENMASK(15, 14), NVME_CMD_EFFECTS_CSE_MASK = GENMASK(18, 16), NVME_CMD_EFFECTS_UUID_SEL = 1 << 19, NVME_CMD_EFFECTS_SCOPE_MASK = GENMASK(31, 20), -- cgit v1.2.3 From 178c54666f9c4d2f49f2ea661d0c11b52f0ed190 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 6 Feb 2024 23:01:02 -0800 Subject: bpf: Mark bpf_spin_{lock,unlock}() helpers with notrace correctly Currently tracing is supposed not to allow for bpf_spin_{lock,unlock}() helper calls. This is to prevent deadlock for the following cases: - there is a prog (prog-A) calling bpf_spin_{lock,unlock}(). - there is a tracing program (prog-B), e.g., fentry, attached to bpf_spin_lock() and/or bpf_spin_unlock(). - prog-B calls bpf_spin_{lock,unlock}(). For such a case, when prog-A calls bpf_spin_{lock,unlock}(), a deadlock will happen. The related source codes are below in kernel/bpf/helpers.c: notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) notrace is supposed to prevent fentry prog from attaching to bpf_spin_{lock,unlock}(). But actually this is not the case and fentry prog can successfully attached to bpf_spin_lock(). Siddharth Chintamaneni reported the issue in [1]. The following is the macro definition for above BPF_CALL_1: #define BPF_CALL_x(x, name, ...) 
\ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ typedef u64 (*btf_##name)(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)); \ u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)) \ { \ return ((btf_##name)____##name)(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ } \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)) #define BPF_CALL_1(name, ...) BPF_CALL_x(1, name, __VA_ARGS__) The notrace attribute is actually applied to the static always_inline function ____bpf_spin_{lock,unlock}(). The actual callback function bpf_spin_{lock,unlock}() is not marked with notrace, hence allowing fentry prog to attach to two helpers, and this may cause the above mentioned deadlock. Siddharth Chintamaneni actually has a reproducer in [2]. To fix the issue, a new macro NOTRACE_BPF_CALL_1 is introduced which will add notrace attribute to the original function instead of the hidden always_inline function and this fixed the problem. 
[1] https://lore.kernel.org/bpf/CAE5sdEigPnoGrzN8WU7Tx-h-iFuMZgW06qp0KHWtpvoXxf1OAQ@mail.gmail.com/ [2] https://lore.kernel.org/bpf/CAE5sdEg6yUc_Jz50AnUXEEUh6O73yQ1Z6NV2srJnef0ZrQkZew@mail.gmail.com/ Fixes: d83525ca62cf ("bpf: introduce bpf_spin_lock") Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20240207070102.335167-1-yonghong.song@linux.dev --- include/linux/filter.h | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index fee070b9826e..36cc29a2934c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -547,24 +547,27 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) __BPF_MAP(n, __BPF_DECL_ARGS, __BPF_N, u64, __ur_1, u64, __ur_2, \ u64, __ur_3, u64, __ur_4, u64, __ur_5) -#define BPF_CALL_x(x, name, ...) \ +#define BPF_CALL_x(x, attr, name, ...) \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ typedef u64 (*btf_##name)(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ - u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)); \ - u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)) \ + attr u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)); \ + attr u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)) \ { \ return ((btf_##name)____##name)(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ } \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)) -#define BPF_CALL_0(name, ...) BPF_CALL_x(0, name, __VA_ARGS__) -#define BPF_CALL_1(name, ...) BPF_CALL_x(1, name, __VA_ARGS__) -#define BPF_CALL_2(name, ...) BPF_CALL_x(2, name, __VA_ARGS__) -#define BPF_CALL_3(name, ...) BPF_CALL_x(3, name, __VA_ARGS__) -#define BPF_CALL_4(name, ...) BPF_CALL_x(4, name, __VA_ARGS__) -#define BPF_CALL_5(name, ...) 
BPF_CALL_x(5, name, __VA_ARGS__) +#define __NOATTR +#define BPF_CALL_0(name, ...) BPF_CALL_x(0, __NOATTR, name, __VA_ARGS__) +#define BPF_CALL_1(name, ...) BPF_CALL_x(1, __NOATTR, name, __VA_ARGS__) +#define BPF_CALL_2(name, ...) BPF_CALL_x(2, __NOATTR, name, __VA_ARGS__) +#define BPF_CALL_3(name, ...) BPF_CALL_x(3, __NOATTR, name, __VA_ARGS__) +#define BPF_CALL_4(name, ...) BPF_CALL_x(4, __NOATTR, name, __VA_ARGS__) +#define BPF_CALL_5(name, ...) BPF_CALL_x(5, __NOATTR, name, __VA_ARGS__) + +#define NOTRACE_BPF_CALL_1(name, ...) BPF_CALL_x(1, notrace, name, __VA_ARGS__) #define bpf_ctx_range(TYPE, MEMBER) \ offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1 -- cgit v1.2.3 From 77c0208e199ccb0986fb3612f2409c8cdcb036ad Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Thu, 8 Feb 2024 18:37:47 -0800 Subject: bpf: add btf pointer to struct bpf_ctx_arg_aux. Enable the providers to use types defined in a module instead of in the kernel (btf_vmlinux). Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240209023750.1153905-2-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1ebbee1d648e..3b7836f0a83e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1415,6 +1415,7 @@ struct bpf_jit_poke_descriptor { struct bpf_ctx_arg_aux { u32 offset; enum bpf_reg_type reg_type; + struct btf *btf; u32 btf_id; }; -- cgit v1.2.3 From 6115a0aeef01aef152ad7738393aad11422bfb82 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Thu, 8 Feb 2024 18:37:48 -0800 Subject: bpf: Move __kfunc_param_match_suffix() to btf.c. Move __kfunc_param_match_suffix() to btf.c and rename it as btf_param_match_suffix(). It can be reused by bpf_struct_ops later. 
Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240209023750.1153905-3-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- include/linux/btf.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 1ee8977b8c95..df76a14c64f6 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -495,6 +495,10 @@ static inline void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id) return bsearch(&id, set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func); } +bool btf_param_match_suffix(const struct btf *btf, + const struct btf_param *arg, + const char *suffix); + struct bpf_verifier_log; #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) -- cgit v1.2.3 From 1611603537a4b88cec7993f32b70c03113801a46 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Thu, 8 Feb 2024 18:37:49 -0800 Subject: bpf: Create argument information for nullable arguments. Collect argument information from the type information of stub functions to mark arguments of BPF struct_ops programs with PTR_MAYBE_NULL if they are nullable. A nullable argument is annotated by suffixing "__nullable" at the argument name of stub function. For nullable arguments, this patch sets a struct bpf_ctx_arg_aux to label their reg_type with PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL. This makes the verifier check programs and ensure that they properly check the pointer. The programs should check if the pointer is null before accessing the pointed memory. The implementer of a struct_ops type should annotate the arguments that can be null. The implementer should define a stub function (empty) as a placeholder for each defined operator. The name of a stub function should be in the pattern "<struct_ops_type>__<operator_name>". For example, for test_maybe_null of struct bpf_testmod_ops, its stub function name should be "bpf_testmod_ops__test_maybe_null".
You mark an argument nullable by suffixing the argument name with "__nullable" at the stub function. Since we already have stub functions for kCFI, we just reuse these stub functions with the naming convention mentioned earlier. These stub functions with the naming convention are only required if there are nullable arguments to annotate. For functions having no nullable arguments, stub functions are not necessary for the purpose of this patch. This patch will prepare a list of struct bpf_ctx_arg_aux, aka arg_info, for each member field of a struct_ops type. "arg_info" will be assigned to "prog->aux->ctx_arg_info" of BPF struct_ops programs in check_struct_ops_btf_id() so that it can be used by btf_ctx_access() later to set reg_type properly for the verifier. Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240209023750.1153905-4-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 21 +++++++++++++++++++++ include/linux/btf.h | 2 ++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3b7836f0a83e..c7aa99b44dbd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1709,6 +1709,19 @@ struct bpf_struct_ops { struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS]; }; +/* Every member of a struct_ops type has an instance even a member is not + * an operator (function pointer). The "info" field will be assigned to + * prog->aux->ctx_arg_info of BPF struct_ops programs to provide the + * argument information required by the verifier to verify the program. + * + * btf_ctx_access() will lookup prog->aux->ctx_arg_info to find the + * corresponding entry for an given argument.
+ */ +struct bpf_struct_ops_arg_info { + struct bpf_ctx_arg_aux *info; + u32 cnt; +}; + struct bpf_struct_ops_desc { struct bpf_struct_ops *st_ops; @@ -1716,6 +1729,9 @@ struct bpf_struct_ops_desc { const struct btf_type *value_type; u32 type_id; u32 value_id; + + /* Collection of argument information for each member */ + struct bpf_struct_ops_arg_info *arg_info; }; enum bpf_struct_ops_state { @@ -1790,6 +1806,7 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, struct btf *btf, struct bpf_verifier_log *log); void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map); +void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc); #else #define register_bpf_struct_ops(st_ops, type) ({ (void *)(st_ops); 0; }) static inline bool bpf_try_module_get(const void *data, struct module *owner) @@ -1814,6 +1831,10 @@ static inline void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struc { } +static inline void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc) +{ +} + #endif #if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM) diff --git a/include/linux/btf.h b/include/linux/btf.h index df76a14c64f6..cb96f6263638 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -498,6 +498,8 @@ static inline void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id) bool btf_param_match_suffix(const struct btf *btf, const struct btf_param *arg, const char *suffix); +int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto, + u32 arg_no); struct bpf_verifier_log; -- cgit v1.2.3 From 32e18e7688c6847b0c9db073aafb00639ecf576c Mon Sep 17 00:00:00 2001 From: Oliver Crumrine Date: Fri, 9 Feb 2024 14:41:22 -0500 Subject: bpf: remove check in __cgroup_bpf_run_filter_skb Originally, this patch removed a redundant check in BPF_CGROUP_RUN_PROG_INET_EGRESS, as the check was already being done in the function it called, __cgroup_bpf_run_filter_skb. 
For v2, it was recommended that I remove the check from __cgroup_bpf_run_filter_skb, and add the checks to the other macro that calls that function, BPF_CGROUP_RUN_PROG_INET_INGRESS. To sum it up, checking that the socket exists and that it is a full socket is now part of both macros BPF_CGROUP_RUN_PROG_INET_EGRESS and BPF_CGROUP_RUN_PROG_INET_INGRESS, and it is no longer part of the function they call, __cgroup_bpf_run_filter_skb. v3->v4: Fixed weird merge conflict. v2->v3: Sent to bpf-next instead of generic patch v1->v2: Addressed feedback about where check should be removed. Signed-off-by: Oliver Crumrine Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/7lv62yiyvmj5a7eozv2iznglpkydkdfancgmbhiptrgvgan5sy@3fl3onchgdz3 Signed-off-by: Martin KaFai Lau --- include/linux/bpf-cgroup.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index a789266feac3..fb3c3e7181e6 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -196,7 +196,8 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_INET_INGRESS) && \ - cgroup_bpf_sock_enabled(sk, CGROUP_INET_INGRESS)) \ + cgroup_bpf_sock_enabled(sk, CGROUP_INET_INGRESS) && sk && \ + sk_fullsock(sk)) \ __ret = __cgroup_bpf_run_filter_skb(sk, skb, \ CGROUP_INET_INGRESS); \ \ -- cgit v1.2.3 From fb5b86cfd4ef21ea18966718f6bf6c8f1b9df12e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 12 Feb 2024 15:32:18 -0800 Subject: bpf: simplify btf_get_prog_ctx_type() into btf_is_prog_ctx_type() Return result of btf_get_prog_ctx_type() is never used and callers only check NULL vs non-NULL case to determine if given type matches expected PTR_TO_CTX type. So rename function to `btf_is_prog_ctx_type()` and return a simple true/false. We'll use this simpler interface to handle kprobe program type's special typedef case in the next patch.
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240212233221.2575350-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index cb96f6263638..f9e56fd12a9f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -531,10 +531,9 @@ s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id); int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner); struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id); -const struct btf_type * -btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, enum bpf_prog_type prog_type, - int arg); +bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, enum bpf_prog_type prog_type, + int arg); int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type); bool btf_types_are_same(const struct btf *btf1, u32 id1, const struct btf *btf2, u32 id2); @@ -574,12 +573,12 @@ static inline struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf { return NULL; } -static inline const struct btf_member * -btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, enum bpf_prog_type prog_type, - int arg) +static inline bool +btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, enum bpf_prog_type prog_type, + int arg) { - return NULL; + return false; } static inline int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type) { -- cgit v1.2.3 From 4d2bb0bfe8741a8778e0053f31a4e0f0cba80e8b Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 12 Feb 2024 10:50:55 +0100 Subject: xdp: rely on skb pointer reference in do_xdp_generic and 
netif_receive_generic_xdp Rely on skb pointer reference instead of the skb pointer in do_xdp_generic and netif_receive_generic_xdp routine signatures. This is a preliminary patch to add multi-buff support for xdp running in generic mode where we will need to reallocate the skb to avoid linearization and we will need to make it visible to do_xdp_generic() caller. Acked-by: Jesper Dangaard Brouer Reviewed-by: Toke Hoiland-Jorgensen Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/c09415b1f48c8620ef4d76deed35050a7bddf7c2.1707729884.git.lorenzo@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 07cefa32eafa..a3f9c95da51e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3958,7 +3958,7 @@ static inline void dev_consume_skb_any(struct sk_buff *skb) u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog); void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog); -int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb); +int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb); int netif_rx(struct sk_buff *skb); int __netif_rx(struct sk_buff *skb); -- cgit v1.2.3 From e6d5dbdd20aa6a86974af51deb9414cd2e7794cb Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 12 Feb 2024 10:50:56 +0100 Subject: xdp: add multi-buff support for xdp running in generic mode Similar to native xdp, do not always linearize the skb in netif_receive_generic_xdp routine but create a non-linear xdp_buff to be processed by the eBPF program. This allows adding multi-buffer support for xdp running in generic mode.
Acked-by: Jesper Dangaard Brouer Reviewed-by: Toke Hoiland-Jorgensen Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/1044d6412b1c3e95b40d34993fd5f37cd2f319fd.1707729884.git.lorenzo@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2dde34c29203..def3d8689c3d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3446,6 +3446,8 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f) __skb_frag_ref(&skb_shinfo(skb)->frags[f]); } +int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, + struct bpf_prog *prog); bool napi_pp_put_page(struct page *page, bool napi_safe); static inline void -- cgit v1.2.3 From 27accb3cc08a0ec4e348356774042d5fa5f30cce Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 12 Feb 2024 10:50:57 +0100 Subject: veth: rely on skb_pp_cow_data utility routine Rely on skb_pp_cow_data utility routine and remove duplicated code. 
Acked-by: Jesper Dangaard Brouer Reviewed-by: Toke Hoiland-Jorgensen Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/029cc14cce41cb242ee7efdcf32acc81f1ce4e9f.1707729884.git.lorenzo@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index def3d8689c3d..696e7680656f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3446,6 +3446,8 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f) __skb_frag_ref(&skb_shinfo(skb)->frags[f]); } +int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, + unsigned int headroom); int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, struct bpf_prog *prog); bool napi_pp_put_page(struct page *page, bool napi_safe); -- cgit v1.2.3 From d2e8899de71cd0a3c22a0eadfb9d54604d34eb96 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Tue, 13 Feb 2024 11:44:01 -0300 Subject: soc: qcom: apr: make aprbus const Since commit d492cc2573a0 ("driver core: device.h: make struct bus_type a const *"), the driver core can properly handle constant struct bus_type, move the aprbus variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: "Ricardo B. 
Marliere" Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20240213-bus_cleanup-apr-v1-1-50c824eec06d@marliere.net Signed-off-by: Bjorn Andersson --- include/linux/soc/qcom/apr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/apr.h b/include/linux/soc/qcom/apr.h index be98aebcb3e1..7161a3183eda 100644 --- a/include/linux/soc/qcom/apr.h +++ b/include/linux/soc/qcom/apr.h @@ -9,7 +9,7 @@ #include #include -extern struct bus_type aprbus; +extern const struct bus_type aprbus; #define APR_HDR_LEN(hdr_len) ((hdr_len)/4) -- cgit v1.2.3 From 161e16a5e50a82d219b3df3ce32286b0a2ae08bd Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 30 Jan 2024 13:39:47 +0100 Subject: PM: domains: Add helper functions to attach/detach multiple PM domains Attaching/detaching of a device to multiple PM domains has started to become a common operation for many drivers, typically during ->probe() and ->remove(). In most cases, this has led to lots of boilerplate code in the drivers. To fix up the situation, let's introduce a pair of helper functions, dev_pm_domain_attach|detach_list(), that drivers can use instead of the open-coding. Note that it seems reasonable to limit the support for these helpers to DT based platforms, as it's the only valid use case for now. Suggested-by: Daniel Baluta Tested-by: Bryan O'Donoghue Tested-by: Iuliana Prodan Signed-off-by: Ulf Hansson Link: https://lore.kernel.org/r/20240130123951.236243-2-ulf.hansson@linaro.org --- include/linux/pm_domain.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index b97c5e9820f9..fb91770ba4ba 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -19,6 +19,33 @@ #include #include +/* + * Flags to control the behaviour when attaching a device to its PM domains. 
+ * + * PD_FLAG_NO_DEV_LINK: As the default behaviour creates a device-link + * for every PM domain that gets attached, this + * flag can be used to skip that. + * + * PD_FLAG_DEV_LINK_ON: Add the DL_FLAG_RPM_ACTIVE to power-on the + * supplier and its PM domain when creating the + * device-links. + * + */ +#define PD_FLAG_NO_DEV_LINK BIT(0) +#define PD_FLAG_DEV_LINK_ON BIT(1) + +struct dev_pm_domain_attach_data { + const char * const *pd_names; + const u32 num_pd_names; + const u32 pd_flags; +}; + +struct dev_pm_domain_list { + struct device **pd_devs; + struct device_link **pd_links; + u32 num_pds; +}; + /* * Flags to control the behaviour of a genpd. * @@ -420,7 +447,11 @@ struct device *dev_pm_domain_attach_by_id(struct device *dev, unsigned int index); struct device *dev_pm_domain_attach_by_name(struct device *dev, const char *name); +int dev_pm_domain_attach_list(struct device *dev, + const struct dev_pm_domain_attach_data *data, + struct dev_pm_domain_list **list); void dev_pm_domain_detach(struct device *dev, bool power_off); +void dev_pm_domain_detach_list(struct dev_pm_domain_list *list); int dev_pm_domain_start(struct device *dev); void dev_pm_domain_set(struct device *dev, struct dev_pm_domain *pd); int dev_pm_domain_set_performance_state(struct device *dev, unsigned int state); @@ -439,7 +470,14 @@ static inline struct device *dev_pm_domain_attach_by_name(struct device *dev, { return NULL; } +static inline int dev_pm_domain_attach_list(struct device *dev, + const struct dev_pm_domain_attach_data *data, + struct dev_pm_domain_list **list) +{ + return 0; +} static inline void dev_pm_domain_detach(struct device *dev, bool power_off) {} +static inline void dev_pm_domain_detach_list(struct dev_pm_domain_list *list) {} static inline int dev_pm_domain_start(struct device *dev) { return 0; -- cgit v1.2.3 From 4d0824608a636b64373e52d3ef1516a86048e0e7 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 8 Feb 2024 21:28:21 +0100 Subject: pmdomain: core: 
constify of_phandle_args in xlate The xlate callbacks are supposed to translate of_phandle_args to proper provider without modifying the of_phandle_args. Make the argument pointer to const for code safety and readability. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240208202822.631449-1-krzysztof.kozlowski@linaro.org Signed-off-by: Ulf Hansson --- include/linux/pm_domain.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index fb91770ba4ba..1a391ef1b6f8 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -349,7 +349,7 @@ static inline void dev_pm_genpd_resume(struct device *dev) {} /* OF PM domain providers */ struct of_device_id; -typedef struct generic_pm_domain *(*genpd_xlate_t)(struct of_phandle_args *args, +typedef struct generic_pm_domain *(*genpd_xlate_t)(const struct of_phandle_args *args, void *data); struct genpd_onecell_data { -- cgit v1.2.3 From 4af6bc163c4d841cf60bc9bf6e98603a232889b6 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 8 Feb 2024 21:28:22 +0100 Subject: pmdomain: core: constify of_phandle_args in add device and subdomain Pointer to of_phandle_args is not modified by of_genpd_add_device() and of_genpd_add_subdomain(), so it can be made pointer to const for code safety and readability. 
Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240208202822.631449-2-krzysztof.kozlowski@linaro.org Signed-off-by: Ulf Hansson --- include/linux/pm_domain.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 1a391ef1b6f8..772d3280d35f 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -364,11 +364,11 @@ int of_genpd_add_provider_simple(struct device_node *np, int of_genpd_add_provider_onecell(struct device_node *np, struct genpd_onecell_data *data); void of_genpd_del_provider(struct device_node *np); -int of_genpd_add_device(struct of_phandle_args *args, struct device *dev); -int of_genpd_add_subdomain(struct of_phandle_args *parent_spec, - struct of_phandle_args *subdomain_spec); -int of_genpd_remove_subdomain(struct of_phandle_args *parent_spec, - struct of_phandle_args *subdomain_spec); +int of_genpd_add_device(const struct of_phandle_args *args, struct device *dev); +int of_genpd_add_subdomain(const struct of_phandle_args *parent_spec, + const struct of_phandle_args *subdomain_spec); +int of_genpd_remove_subdomain(const struct of_phandle_args *parent_spec, + const struct of_phandle_args *subdomain_spec); struct generic_pm_domain *of_genpd_remove_last(struct device_node *np); int of_genpd_parse_idle_states(struct device_node *dn, struct genpd_power_state **states, int *n); @@ -393,20 +393,20 @@ static inline int of_genpd_add_provider_onecell(struct device_node *np, static inline void of_genpd_del_provider(struct device_node *np) {} -static inline int of_genpd_add_device(struct of_phandle_args *args, +static inline int of_genpd_add_device(const struct of_phandle_args *args, struct device *dev) { return -ENODEV; } -static inline int of_genpd_add_subdomain(struct of_phandle_args *parent_spec, - struct of_phandle_args *subdomain_spec) +static inline int of_genpd_add_subdomain(const struct 
of_phandle_args *parent_spec, + const struct of_phandle_args *subdomain_spec) { return -ENODEV; } -static inline int of_genpd_remove_subdomain(struct of_phandle_args *parent_spec, - struct of_phandle_args *subdomain_spec) +static inline int of_genpd_remove_subdomain(const struct of_phandle_args *parent_spec, + const struct of_phandle_args *subdomain_spec) { return -ENODEV; } -- cgit v1.2.3 From 4d42b37def70327b2bb19f823d42289aed2cd7c7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2024 06:32:36 +0000 Subject: net: convert dev->reg_state to u8 Prepares things so that dev->reg_state reads can be lockless, by adding WRITE_ONCE() on write side. READ_ONCE()/WRITE_ONCE() do not support bitfields. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a3f9c95da51e..631124655107 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1815,6 +1815,15 @@ enum netdev_stat_type { NETDEV_PCPU_STAT_DSTATS, /* struct pcpu_dstats */ }; +enum netdev_reg_state { + NETREG_UNINITIALIZED = 0, + NETREG_REGISTERED, /* completed register_netdevice */ + NETREG_UNREGISTERING, /* called unregister_netdevice */ + NETREG_UNREGISTERED, /* completed unregister todo */ + NETREG_RELEASED, /* called free_netdev */ + NETREG_DUMMY, /* dummy device for NAPI poll */ +}; + /** * struct net_device - The DEVICE structure. 
* @@ -2372,13 +2381,7 @@ struct net_device { struct list_head link_watch_list; - enum { NETREG_UNINITIALIZED=0, - NETREG_REGISTERED, /* completed register_netdevice */ - NETREG_UNREGISTERING, /* called unregister_netdevice */ - NETREG_UNREGISTERED, /* completed unregister todo */ - NETREG_RELEASED, /* called free_netdev */ - NETREG_DUMMY, /* dummy device for NAPI poll */ - } reg_state:8; + u8 reg_state; bool dismantle; @@ -5254,7 +5257,9 @@ static inline const char *netdev_name(const struct net_device *dev) static inline const char *netdev_reg_state(const struct net_device *dev) { - switch (dev->reg_state) { + u8 reg_state = READ_ONCE(dev->reg_state); + + switch (reg_state) { case NETREG_UNINITIALIZED: return " (uninitialized)"; case NETREG_REGISTERED: return ""; case NETREG_UNREGISTERING: return " (unregistering)"; @@ -5263,7 +5268,7 @@ static inline const char *netdev_reg_state(const struct net_device *dev) case NETREG_DUMMY: return " (dummy)"; } - WARN_ONCE(1, "%s: unknown reg_state %d\n", dev->name, dev->reg_state); + WARN_ONCE(1, "%s: unknown reg_state %d\n", dev->name, reg_state); return " (unknown)"; } -- cgit v1.2.3 From 6a2968ee1ee2cc6fce30f6f5724442b34b1483b3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2024 06:32:42 +0000 Subject: net: add netdev_set_operstate() helper dev_base_lock is going away, add netdev_set_operstate() helper so that hsr does not have to know core internals. Remove dev_base_lock acquisition from rfc2863_policy() v3: use an "unsigned int" for dev->operstate, so that try_cmpxchg() can work on all arches. ( https://lore.kernel.org/oe-kbuild-all/202402081918.OLyGaea3-lkp@intel.com/ ) Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 2 +- include/linux/rtnetlink.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 631124655107..697370706a82 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2258,7 +2258,7 @@ struct net_device { const struct tlsdev_ops *tlsdev_ops; #endif - unsigned char operstate; + unsigned int operstate; unsigned char link_mode; unsigned char if_port; diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 21780608cf47..cdfc897f1e3c 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -172,4 +172,6 @@ rtnl_notify_needed(const struct net *net, u16 nlflags, u32 group) return (nlflags & NLM_F_ECHO) || rtnl_has_listeners(net, group); } +void netdev_set_operstate(struct net_device *dev, int newstate); + #endif /* __LINUX_RTNETLINK_H */ -- cgit v1.2.3 From 1b3ef46cb7f2618cc0b507393220a69810f6da12 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2024 06:32:45 +0000 Subject: net: remove dev_base_lock dev_base_lock is not needed anymore, all remaining users also hold RTNL. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 697370706a82..c541550b0e6e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3077,8 +3077,6 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev); int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info); -extern rwlock_t dev_base_lock; /* Device list lock */ - #define for_each_netdev(net, d) \ list_for_each_entry(d, &(net)->dev_base_head, dev_list) #define for_each_netdev_reverse(net, d) \ -- cgit v1.2.3 From c90e3ecc91584558d24c82940a3651fdfc174be0 Mon Sep 17 00:00:00 2001 From: Onkarnath Date: Thu, 11 Jan 2024 14:57:22 +0530 Subject: rcu/sync: remove un-used rcu_sync_enter_start function With commit '6a010a49b63a ("cgroup: Make !percpu threadgroup_rwsem operations optional")' usage of rcu_sync_enter_start is removed. So this function can also be removed. In the words of Oleg Nesterov: __rcu_sync_enter(wait => false) is a better alternative if someone needs rcu_sync_enter_start() again. Link: https://lore.kernel.org/all/20220725121208.GB28662@redhat.com/ Signed-off-by: Onkarnath Signed-off-by: Maninder Singh Acked-by: Oleg Nesterov Acked-by: Tejun Heo Reviewed-by: Paul E. 
McKenney Signed-off-by: Boqun Feng --- include/linux/rcu_sync.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h index 0027d4c8087c..3860dbb9107a 100644 --- a/include/linux/rcu_sync.h +++ b/include/linux/rcu_sync.h @@ -37,7 +37,6 @@ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp) } extern void rcu_sync_init(struct rcu_sync *); -extern void rcu_sync_enter_start(struct rcu_sync *); extern void rcu_sync_enter(struct rcu_sync *); extern void rcu_sync_exit(struct rcu_sync *); extern void rcu_sync_dtor(struct rcu_sync *); -- cgit v1.2.3 From 2eb52fa8900e642b3b5054c4bf9776089d2a935f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 4 Dec 2023 09:33:29 -0800 Subject: rcu-tasks: Repair RCU Tasks Trace quiescence check The context-switch-time check for RCU Tasks Trace quiescence expects current->trc_reader_special.b.need_qs to be zero, and if so, updates it to TRC_NEED_QS_CHECKED. This is backwards, because if this value is zero, there is no RCU Tasks Trace grace period in flight, and thus no need for a quiescent state. Instead, when a grace period starts, this field is set to TRC_NEED_QS. This commit therefore changes the check from zero to TRC_NEED_QS. Reported-by: Steven Rostedt Signed-off-by: Paul E. 
McKenney Tested-by: Steven Rostedt (Google) Signed-off-by: Boqun Feng --- include/linux/rcupdate.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 0746b1b0b663..16f519914415 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -184,9 +184,9 @@ void rcu_tasks_trace_qs_blkd(struct task_struct *t); do { \ int ___rttq_nesting = READ_ONCE((t)->trc_reader_nesting); \ \ - if (likely(!READ_ONCE((t)->trc_reader_special.b.need_qs)) && \ + if (unlikely(READ_ONCE((t)->trc_reader_special.b.need_qs) == TRC_NEED_QS) && \ likely(!___rttq_nesting)) { \ - rcu_trc_cmpxchg_need_qs((t), 0, TRC_NEED_QS_CHECKED); \ + rcu_trc_cmpxchg_need_qs((t), TRC_NEED_QS, TRC_NEED_QS_CHECKED); \ } else if (___rttq_nesting && ___rttq_nesting != INT_MIN && \ !READ_ONCE((t)->trc_reader_special.b.blocked)) { \ rcu_tasks_trace_qs_blkd(t); \ -- cgit v1.2.3 From 3b566b30b41401888ee0e8eb904a1e7a6693794b Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 13 Feb 2024 21:35:15 -0600 Subject: RAS/AMD/ATL: Add MI300 row retirement support DRAM row retirement depends on model-specific information that is best done within the AMD Address Translation Library. Export a generic wrapper function for other modules to use. Add any model-specific helpers here. 
Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240214033516.1344948-2-yazen.ghannam@amd.com --- include/linux/ras.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ras.h b/include/linux/ras.h index 09c632832bf1..a64182bc72ad 100644 --- a/include/linux/ras.h +++ b/include/linux/ras.h @@ -45,8 +45,10 @@ struct atl_err { #if IS_ENABLED(CONFIG_AMD_ATL) void amd_atl_register_decoder(unsigned long (*f)(struct atl_err *)); void amd_atl_unregister_decoder(void); +void amd_retire_dram_row(struct atl_err *err); unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err); #else +static inline void amd_retire_dram_row(struct atl_err *err) { } static inline unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; } #endif /* CONFIG_AMD_ATL */ -- cgit v1.2.3 From 4e77431cda4973f03d063c47f6ea313dfceebf16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 9 Jan 2024 22:34:32 +0100 Subject: pwm: Drop useless member .of_pwm_n_cells of struct pwm_chip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apart from the two of_xlate implementations this member is write-only. In the of_xlate functions of_pwm_xlate_with_flags() and of_pwm_single_xlate() it's more sensible to check for args->args_count because this is what is actually used in the device tree. 
Acked-by: Douglas Anderson Link: https://lore.kernel.org/r/53d8c545aa8f79a920358be9e72e382b3981bdc4.1704835845.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- include/linux/pwm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index fcc2c4496f73..8ffe9ae7a23a 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -271,7 +271,6 @@ struct pwm_ops { * @id: unique number of this PWM chip * @npwm: number of PWMs controlled by this chip * @of_xlate: request a PWM device given a device tree PWM specifier - * @of_pwm_n_cells: number of cells expected in the device tree PWM specifier * @atomic: can the driver's ->apply() be called in atomic context * @pwms: array of PWM devices allocated by the framework */ @@ -284,7 +283,6 @@ struct pwm_chip { struct pwm_device * (*of_xlate)(struct pwm_chip *chip, const struct of_phandle_args *args); - unsigned int of_pwm_n_cells; bool atomic; /* only used internally by the PWM framework */ -- cgit v1.2.3 From 4e59267c7a20eb1c1ad9106e682cb6a0d8eb3111 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:30:48 +0100 Subject: pwm: Provide an inline function to get the parent device of a given chip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently a pwm_chip stores in its struct device *dev member a pointer to the parent device. Preparing a change that embeds a full struct device in struct pwm_chip, this accessor function should be used in all drivers directly accessing chip->dev now. This way struct pwm_chip and this new function can be changed without having to touch all drivers in the same change set. Make use of this function in the framework's core sources. 
Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/cc30090d2f9762bed9854a55612144bccc910781.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- include/linux/pwm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 8ffe9ae7a23a..07af6910bdce 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -289,6 +289,11 @@ struct pwm_chip { struct pwm_device *pwms; }; +static inline struct device *pwmchip_parent(const struct pwm_chip *chip) +{ + return chip->dev; +} + #if IS_ENABLED(CONFIG_PWM) /* PWM user APIs */ int pwm_apply_might_sleep(struct pwm_device *pwm, const struct pwm_state *state); -- cgit v1.2.3 From 24003d501f112a9827aa8c59c00e12b2d5cbddf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:30:49 +0100 Subject: pwm: Provide wrappers for storing and getting driver private data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These functions are useful to store and query driver private data depending on the pwm_chip. After struct pwm_chip got its own struct device, this can make use of dev_get_drvdata() and dev_set_drvdata() on that device. These functions are required already now to convert drivers to pwmchip_alloc() which must happen before changing pwm_chip::dev. 
Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/67514cdf29d29bd8b4ad8d44fac87f6ae6dca1e5.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- include/linux/pwm.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 07af6910bdce..29a7d9140f77 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -272,6 +272,7 @@ struct pwm_ops { * @npwm: number of PWMs controlled by this chip * @of_xlate: request a PWM device given a device tree PWM specifier * @atomic: can the driver's ->apply() be called in atomic context + * @driver_data: Private pointer for driver specific info * @pwms: array of PWM devices allocated by the framework */ struct pwm_chip { @@ -286,6 +287,7 @@ struct pwm_chip { bool atomic; /* only used internally by the PWM framework */ + void *driver_data; struct pwm_device *pwms; }; @@ -294,6 +296,24 @@ static inline struct device *pwmchip_parent(const struct pwm_chip *chip) return chip->dev; } +static inline void *pwmchip_get_drvdata(struct pwm_chip *chip) +{ + /* + * After pwm_chip got a dedicated struct device, this can be replaced by + * dev_get_drvdata(&chip->dev); + */ + return chip->driver_data; +} + +static inline void pwmchip_set_drvdata(struct pwm_chip *chip, void *data) +{ + /* + * After pwm_chip got a dedicated struct device, this can be replaced by + * dev_set_drvdata(&chip->dev, data); + */ + chip->driver_data = data; +} + #if IS_ENABLED(CONFIG_PWM) /* PWM user APIs */ int pwm_apply_might_sleep(struct pwm_device *pwm, const struct pwm_state *state); -- cgit v1.2.3 From 024913dbf99f06f09d5c06c940ca39821697d41c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:30:50 +0100 Subject: pwm: Provide pwmchip_alloc() function and a devm variant of it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This function allocates 
a struct pwm_chip and driver data. Compared to the status quo the split into pwm_chip and driver data is new, otherwise it doesn't change anything relevant (yet). The intention is that after all drivers are switched to use this allocation function, it's possible to add a struct device to struct pwm_chip to properly track the latter's lifetime without touching all drivers again. Proper lifetime tracking is a necessary precondition to introduce character device support for PWMs (that implements atomic setting and doesn't suffer from the sysfs overhead of the /sys/class/pwm userspace support). The new function pwmchip_priv() (obviously?) only works for chips allocated with pwmchip_alloc(). Link: https://lore.kernel.org/r/9577d6053a5a52536057dc8654ff567181c2da82.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- include/linux/pwm.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 29a7d9140f77..4a6568dfdf3f 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -403,6 +403,10 @@ static inline bool pwm_might_sleep(struct pwm_device *pwm) int pwm_capture(struct pwm_device *pwm, struct pwm_capture *result, unsigned long timeout); +void pwmchip_put(struct pwm_chip *chip); +struct pwm_chip *pwmchip_alloc(struct device *parent, unsigned int npwm, size_t sizeof_priv); +struct pwm_chip *devm_pwmchip_alloc(struct device *parent, unsigned int npwm, size_t sizeof_priv); + int __pwmchip_add(struct pwm_chip *chip, struct module *owner); #define pwmchip_add(chip) __pwmchip_add(chip, THIS_MODULE) void pwmchip_remove(struct pwm_chip *chip); @@ -475,6 +479,24 @@ static inline int pwm_capture(struct pwm_device *pwm, return -EINVAL; } +static inline void pwmchip_put(struct pwm_chip *chip) +{ +} + +static inline struct pwm_chip *pwmchip_alloc(struct device *parent, + unsigned int npwm, + size_t sizeof_priv) +{ + return ERR_PTR(-EINVAL); +} + +static 
inline struct pwm_chip *devm_pwmchip_alloc(struct device *parent, + unsigned int npwm, + size_t sizeof_priv) +{ + return pwmchip_alloc(parent, npwm, sizeof_priv); +} + static inline int pwmchip_add(struct pwm_chip *chip) { return -EINVAL; -- cgit v1.2.3 From 81800aef0eba33df2b30f2e29a0137078b9ba256 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Tue, 13 Feb 2024 11:48:00 -0300 Subject: net: mdio_bus: make mdio_bus_type const Since commit d492cc2573a0 ("driver core: device.h: make struct bus_type a const *"), the driver core can properly handle constant struct bus_type, move the mdio_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B. Marliere Reviewed-by: Andrew Lunn Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20240213-bus_cleanup-mdio-v1-1-f9e799da7fda@marliere.net Signed-off-by: Paolo Abeni --- include/linux/phy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 2249cdb5957a..c2dda21b39e1 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2129,7 +2129,7 @@ static inline bool phy_package_probe_once(struct phy_device *phydev) return __phy_package_set_once(phydev, PHY_SHARED_F_PROBE_DONE); } -extern struct bus_type mdio_bus_type; +extern const struct bus_type mdio_bus_type; struct mdio_board_info { const char *bus_id; -- cgit v1.2.3 From a5fcea2d2f790aa90b6e996d411ae2cf8db55186 Mon Sep 17 00:00:00 2001 From: Alex Henrie Date: Tue, 13 Feb 2024 23:26:31 -0700 Subject: net: ipv6/addrconf: introduce a regen_min_advance sysctl In RFC 8981, REGEN_ADVANCE cannot be less than 2 seconds, and the RFC does not permit the creation of temporary addresses with lifetimes shorter than that: > When processing a Router Advertisement with a > Prefix Information option carrying a prefix for the 
purposes of > address autoconfiguration (i.e., the A bit is set), the host MUST > perform the following steps: > 5. A temporary address is created only if this calculated preferred > lifetime is greater than REGEN_ADVANCE time units. However, some users want to change their IPv6 address as frequently as possible regardless of the RFC's arbitrary minimum lifetime. For the benefit of those users, add a regen_min_advance sysctl parameter that can be set to below or above 2 seconds. Link: https://datatracker.ietf.org/doc/html/rfc8981 Signed-off-by: Alex Henrie Reviewed-by: David Ahern Signed-off-by: Paolo Abeni --- include/linux/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 5e605e384aac..ef3aa060a289 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -27,6 +27,7 @@ struct ipv6_devconf { __s32 use_tempaddr; __s32 temp_valid_lft; __s32 temp_prefered_lft; + __s32 regen_min_advance; __s32 regen_max_retry; __s32 max_desync_factor; __s32 max_addresses; -- cgit v1.2.3 From 4ff4c745a16c4c151a71863420811e7f406c3ec2 Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Wed, 31 Jan 2024 15:49:35 +0100 Subject: locking: Introduce prepare_sync_core_cmd() Introduce an architecture function that architectures can use to set up ("prepare") SYNC_CORE commands. The function will be used by RISC-V to update its "deferred icache- flush" data structures (icache_stale_mask). Architectures defining prepare_sync_core_cmd() static inline need to select ARCH_HAS_PREPARE_SYNC_CORE_CMD. 
Suggested-by: Mathieu Desnoyers Signed-off-by: Andrea Parri Reviewed-by: Mathieu Desnoyers Link: https://lore.kernel.org/r/20240131144936.29190-4-parri.andrea@gmail.com Signed-off-by: Palmer Dabbelt --- include/linux/sync_core.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sync_core.h b/include/linux/sync_core.h index 013da4b8b327..67bb9794b875 100644 --- a/include/linux/sync_core.h +++ b/include/linux/sync_core.h @@ -17,5 +17,19 @@ static inline void sync_core_before_usermode(void) } #endif -#endif /* _LINUX_SYNC_CORE_H */ +#ifdef CONFIG_ARCH_HAS_PREPARE_SYNC_CORE_CMD +#include +#else +/* + * This is a dummy prepare_sync_core_cmd() implementation that can be used on + * all architectures which provide unconditional core serializing instructions + * in switch_mm(). + * If your architecture doesn't provide such core serializing instructions in + * switch_mm(), you may need to write your own functions. + */ +static inline void prepare_sync_core_cmd(struct mm_struct *mm) +{ +} +#endif +#endif /* _LINUX_SYNC_CORE_H */ -- cgit v1.2.3 From ac81e94ab001c2882e89c9b61417caea64b800df Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 27 Jan 2024 21:47:31 +0530 Subject: genirq/msi: Extend msi_parent_ops Supporting per device MSI domains on ARM64, RISC-V and the zoo of interrupt mechanisms needs a bit more information than what the initial x86 implementation provides. Add the following fields: - required_flags: The flags which a parent domain requires to be set - bus_select_token: The bus token of the parent domain for select() - bus_select_mask: A bitmask of supported child domain bus types This allows providing library functions which can be shared between various interrupt chip implementations and avoids replicating mostly similar code all over the place. 
Signed-off-by: Thomas Gleixner Signed-off-by: Anup Patel Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240127161753.114685-4-apatel@ventanamicro.com --- include/linux/msi.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index ddace8c34dcf..d5d1513ef4d6 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -572,6 +572,11 @@ enum { * struct msi_parent_ops - MSI parent domain callbacks and configuration info * * @supported_flags: Required: The supported MSI flags of the parent domain + * @required_flags: Optional: The required MSI flags of the parent MSI domain + * @bus_select_token: Optional: The bus token of the real parent domain for + * irq_domain::select() + * @bus_select_mask: Optional: A mask of supported BUS_DOMAINs for + * irq_domain::select() * @prefix: Optional: Prefix for the domain and chip name * @init_dev_msi_info: Required: Callback for MSI parent domains to setup parent * domain specific domain flags, domain ops and interrupt chip @@ -579,6 +584,9 @@ enum { */ struct msi_parent_ops { u32 supported_flags; + u32 required_flags; + u32 bus_select_token; + u32 bus_select_mask; const char *prefix; bool (*init_dev_msi_info)(struct device *dev, struct irq_domain *domain, struct irq_domain *msi_parent_domain, -- cgit v1.2.3 From 6516d5a295356f8fd5827a1c0954d7ed5b2324dd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 27 Jan 2024 21:47:32 +0530 Subject: genirq/irqdomain: Add DOMAIN_BUS_DEVICE_MSI Add a new domain bus token to prepare for device MSI which aims to replace the existing platform MSI maze. 
Signed-off-by: Thomas Gleixner Signed-off-by: Anup Patel Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240127161753.114685-5-apatel@ventanamicro.com --- include/linux/irqdomain_defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain_defs.h b/include/linux/irqdomain_defs.h index c29921fd8cd1..a7dea0c8c5e0 100644 --- a/include/linux/irqdomain_defs.h +++ b/include/linux/irqdomain_defs.h @@ -26,6 +26,7 @@ enum irq_domain_bus_token { DOMAIN_BUS_DMAR, DOMAIN_BUS_AMDVI, DOMAIN_BUS_PCI_DEVICE_IMS, + DOMAIN_BUS_DEVICE_MSI, }; #endif /* _LINUX_IRQDOMAIN_DEFS_H */ -- cgit v1.2.3 From c88f9110bfbca5975a8dee4c9792ba12684c7bca Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 27 Jan 2024 21:47:33 +0530 Subject: platform-msi: Prepare for real per device domains Provide functions to create and remove per device MSI domains which replace the platform-MSI domains. The new model is that each of the devices which utilize platform-MSI gets now its private MSI domain which is "customized" in size and with a device specific function to write the MSI message into the device. This is the same functionality as platform-MSI but it avoids all the down sides of platform MSI, i.e. the extra ID book keeping, the special data structure in the msi descriptor. Further the domains are only created when the devices are really in use, so the burden is on the usage and not on the infrastructure. Fill in the domain template and provide two functions to init/allocate and remove a per device MSI domain. Until all users and parent domain providers are converted, the init/alloc function invokes the original platform-MSI code when the irqdomain which is associated to the device does not provide MSI parent functionality yet. 
Signed-off-by: Thomas Gleixner Signed-off-by: Anup Patel Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240127161753.114685-6-apatel@ventanamicro.com --- include/linux/msi.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index d5d1513ef4d6..ef167961c782 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -664,6 +664,10 @@ int platform_msi_device_domain_alloc(struct irq_domain *domain, unsigned int vir void platform_msi_device_domain_free(struct irq_domain *domain, unsigned int virq, unsigned int nvec); void *platform_msi_get_host_data(struct irq_domain *domain); +/* Per device platform MSI */ +int platform_device_msi_init_and_alloc_irqs(struct device *dev, unsigned int nvec, + irq_write_msi_msg_t write_msi_msg); +void platform_device_msi_free_irqs_all(struct device *dev); bool msi_device_has_isolated_msi(struct device *dev); #else /* CONFIG_GENERIC_MSI_IRQ */ -- cgit v1.2.3 From 1a4671ff7a903e87e4e76213e200bb8bcfa942e4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Feb 2024 16:35:43 +0100 Subject: platform-msi: Remove unused interfaces Signed-off-by: Thomas Gleixner --- include/linux/msi.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index ef167961c782..b0842ea55bde 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -635,9 +635,6 @@ struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain); struct irq_domain *platform_msi_create_irq_domain(struct fwnode_handle *fwnode, struct msi_domain_info *info, struct irq_domain *parent); -int platform_msi_domain_alloc_irqs(struct device *dev, unsigned int nvec, - irq_write_msi_msg_t write_msi_msg); -void platform_msi_domain_free_irqs(struct device *dev); /* When an MSI domain is used as an intermediate domain */ int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, -- cgit v1.2.3 From 
9c78c1a85c04bdfbccc5a50588e001087d942b08 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 27 Jan 2024 21:47:35 +0530 Subject: genirq/msi: Provide optional translation op irq_create_fwspec_mapping() requires translation of the firmware spec to a hardware interrupt number and the trigger type information. Wired interrupts which are connected to a wire to MSI bridge, like MBIGEN are allocated that way. So far MBIGEN provides a regular irqdomain which then hooks backwards into the MSI infrastructure. That's an unholy mess and will be replaced with per device MSI domains which are regular MSI domains. Interrupts on MSI domains are not supported by irq_create_fwspec_mapping(), but for making the wire to MSI bridges sane it makes sense to provide a special allocation/free interface in the MSI infrastructure. That avoids the backdoors into the core MSI allocation code and just shares all the regular MSI infrastructure. Provide an optional translation callback in msi_domain_ops which can be utilized by these wire to MSI bridges. No other MSI domain should provide a translation callback. The default translation callback of the MSI irqdomains will warn when it is invoked on a non-prepared MSI domain. Signed-off-by: Thomas Gleixner Signed-off-by: Anup Patel Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240127161753.114685-8-apatel@ventanamicro.com --- include/linux/msi.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index b0842ea55bde..24a54248f1ae 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -412,6 +412,7 @@ bool arch_restore_msi_irqs(struct pci_dev *dev); struct irq_domain; struct irq_domain_ops; struct irq_chip; +struct irq_fwspec; struct device_node; struct fwnode_handle; struct msi_domain_info; @@ -431,6 +432,8 @@ struct msi_domain_info; * function. * @msi_post_free: Optional function which is invoked after freeing * all interrupts. 
+ * @msi_translate: Optional translate callback to support the odd wire to + * MSI bridges, e.g. MBIGEN * * @get_hwirq, @msi_init and @msi_free are callbacks used by the underlying * irqdomain. @@ -468,6 +471,8 @@ struct msi_domain_ops { struct device *dev); void (*msi_post_free)(struct irq_domain *domain, struct device *dev); + int (*msi_translate)(struct irq_domain *domain, struct irq_fwspec *fwspec, + irq_hw_number_t *hwirq, unsigned int *type); }; /** -- cgit v1.2.3 From 2d566a498d6483ba986dadc496f64a20b032608f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 27 Jan 2024 21:47:37 +0530 Subject: genirq/msi: Provide DOMAIN_BUS_WIRED_TO_MSI Provide a domain bus token for the upcoming support for wire to MSI device domains so the domain can be distinguished from regular device MSI domains. Signed-off-by: Thomas Gleixner Signed-off-by: Anup Patel Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240127161753.114685-10-apatel@ventanamicro.com --- include/linux/irqdomain_defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain_defs.h b/include/linux/irqdomain_defs.h index a7dea0c8c5e0..5c1fe6f1fcde 100644 --- a/include/linux/irqdomain_defs.h +++ b/include/linux/irqdomain_defs.h @@ -27,6 +27,7 @@ enum irq_domain_bus_token { DOMAIN_BUS_AMDVI, DOMAIN_BUS_PCI_DEVICE_IMS, DOMAIN_BUS_DEVICE_MSI, + DOMAIN_BUS_WIRED_TO_MSI, }; #endif /* _LINUX_IRQDOMAIN_DEFS_H */ -- cgit v1.2.3 From 9d1c58c8004653b37721dd7b16f4360216778c94 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 27 Jan 2024 21:47:38 +0530 Subject: genirq/msi: Optionally use dev->fwnode for device domain To support wire to MSI domains via the MSI infrastructure it is required to use the firmware node of the device which implements this for creating the MSI domain. Otherwise the existing firmware match mechanisms to find the correct irqdomain for a wired interrupt which is connected to a wire to MSI bridge would fail. 
This cannot be used for the general case because not all devices provide firmware nodes and all regular per device MSI domains are directly associated to the device and do not have to be searched for. Signed-off-by: Thomas Gleixner Signed-off-by: Anup Patel Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240127161753.114685-11-apatel@ventanamicro.com --- include/linux/msi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 24a54248f1ae..36ba6a0852ea 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -552,6 +552,8 @@ enum { MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS = (1 << 5), /* Free MSI descriptors */ MSI_FLAG_FREE_MSI_DESCS = (1 << 6), + /* Use dev->fwnode for MSI device domain creation */ + MSI_FLAG_USE_DEV_FWNODE = (1 << 7), /* Mask for the generic functionality */ MSI_GENERIC_FLAGS_MASK = GENMASK(15, 0), -- cgit v1.2.3 From 0ee1578b00bcf5ef8e7955f0c6f02a624443eb29 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 27 Jan 2024 21:47:39 +0530 Subject: genirq/msi: Provide allocation/free functions for "wired" MSI interrupts To support wire to MSI bridges proper in the MSI core infrastructure it is required to have separate allocation/free interfaces which can be invoked from the regular irqdomain allocation/free functions. 
The mechanism for allocation is: - Allocate the next free MSI descriptor index in the domain - Store the hardware interrupt number and the trigger type which was extracted by the irqdomain core from the firmware spec in the MSI descriptor device cookie so it can be retrieved by the underlying interrupt domain and interrupt chip - Use the regular MSI allocation mechanism for the newly allocated index which returns a fully initialized Linux interrupt on success This works because: - the domains have a fixed size - each hardware interrupt is only allocated once - the underlying domain does not care about the MSI index it only cares about the hardware interrupt number and the trigger type The free function looks up the MSI index in the MSI descriptor of the provided Linux interrupt number and uses the regular index based free functions of the MSI core. Signed-off-by: Thomas Gleixner Signed-off-by: Anup Patel Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240127161753.114685-12-apatel@ventanamicro.com --- include/linux/irqdomain.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index ee0a82c60508..21ecf582a0fe 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -619,6 +619,23 @@ static inline bool irq_domain_is_msi_device(struct irq_domain *domain) #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ +#ifdef CONFIG_GENERIC_MSI_IRQ +int msi_device_domain_alloc_wired(struct irq_domain *domain, unsigned int hwirq, + unsigned int type); +void msi_device_domain_free_wired(struct irq_domain *domain, unsigned int virq); +#else +static inline int msi_device_domain_alloc_wired(struct irq_domain *domain, unsigned int hwirq, + unsigned int type) +{ + WARN_ON_ONCE(1); + return -EINVAL; +} +static inline void msi_device_domain_free_wired(struct irq_domain *domain, unsigned int virq) +{ + WARN_ON_ONCE(1); +} +#endif + #else /* CONFIG_IRQ_DOMAIN */ static 
inline void irq_dispose_mapping(unsigned int virq) { } static inline struct irq_domain *irq_find_matching_fwnode( -- cgit v1.2.3 From 9bbe13a5d414a7f8208dba64b54d2b6e4f7086bd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 27 Jan 2024 21:47:41 +0530 Subject: genirq/msi: Provide MSI_FLAG_PARENT_PM_DEV Some platform-MSI implementations require that power management is redirected to the underlying interrupt chip device. To make this work with per device MSI domains provide a new feature flag and let the core code handle the setup of dev->pm_dev when set during device MSI domain creation. Signed-off-by: Thomas Gleixner Signed-off-by: Anup Patel Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240127161753.114685-14-apatel@ventanamicro.com --- include/linux/msi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 36ba6a0852ea..26d07e23052e 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -554,6 +554,8 @@ enum { MSI_FLAG_FREE_MSI_DESCS = (1 << 6), /* Use dev->fwnode for MSI device domain creation */ MSI_FLAG_USE_DEV_FWNODE = (1 << 7), + /* Set parent->dev into domain->pm_dev on device domain creation */ + MSI_FLAG_PARENT_PM_DEV = (1 << 8), /* Mask for the generic functionality */ MSI_GENERIC_FLAGS_MASK = GENMASK(15, 0), -- cgit v1.2.3 From 8a566f94104df87a067458351675129bb4e1ece2 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 15 Feb 2024 16:22:55 +0200 Subject: seq_buf: Don't use "proxy" headers Update header inclusions to follow IWYU (Include What You Use) principle. 
Link: https://lkml.kernel.org/r/20240215142255.400264-1-andriy.shevchenko@linux.intel.com Cc: "Matthew Wilcox (Oracle)" Cc: Andrew Morton Signed-off-by: Andy Shevchenko Signed-off-by: Steven Rostedt (Google) --- include/linux/seq_buf.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index c44f4b47b945..07b26e751060 100644 --- a/include/linux/seq_buf.h +++ b/include/linux/seq_buf.h @@ -2,7 +2,10 @@ #ifndef _LINUX_SEQ_BUF_H #define _LINUX_SEQ_BUF_H -#include <linux/fs.h> +#include <linux/bug.h> +#include <linux/minmax.h> +#include <linux/seq_file.h> +#include <linux/types.h> /* * Trace sequences are used to allow a function to call several other functions -- cgit v1.2.3 From 6efe4d18796934b8ada66c1c446510e7f2d9b972 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 15 Feb 2024 17:25:06 +0200 Subject: seq_buf: Fix kernel documentation There are plenty of issues with the kernel documentation here: - misspelled word "sequence" - different style of returned value descriptions - missed Return sections - unaligned style of ASCII / NUL-terminated / etc - wrong function references Fix all these. 
Link: https://lkml.kernel.org/r/20240215152506.598340-1-andriy.shevchenko@linux.intel.com Cc: Andrew Morton Signed-off-by: Andy Shevchenko Reviewed-by: Randy Dunlap Signed-off-by: Steven Rostedt (Google) --- include/linux/seq_buf.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index 07b26e751060..fe41da005970 100644 --- a/include/linux/seq_buf.h +++ b/include/linux/seq_buf.h @@ -13,7 +13,7 @@ */ /** - * seq_buf - seq buffer structure + * struct seq_buf - seq buffer structure * @buffer: pointer to the buffer * @size: size of the buffer * @len: the amount of data inside the buffer @@ -80,10 +80,10 @@ static inline unsigned int seq_buf_used(struct seq_buf *s) } /** - * seq_buf_str - get %NUL-terminated C string from seq_buf + * seq_buf_str - get NUL-terminated C string from seq_buf * @s: the seq_buf handle * - * This makes sure that the buffer in @s is nul terminated and + * This makes sure that the buffer in @s is NUL-terminated and * safe to read as a string. * * Note, if this is called when the buffer has overflowed, then @@ -93,7 +93,7 @@ static inline unsigned int seq_buf_used(struct seq_buf *s) * After this function is called, s->buffer is safe to use * in string operations. * - * Returns @s->buf after making sure it is terminated. + * Returns: @s->buf after making sure it is terminated. */ static inline const char *seq_buf_str(struct seq_buf *s) { @@ -113,7 +113,7 @@ static inline const char *seq_buf_str(struct seq_buf *s) * @s: the seq_buf handle * @bufp: the beginning of the buffer is stored here * - * Return the number of bytes available in the buffer, or zero if + * Returns: the number of bytes available in the buffer, or zero if * there's no space. 
*/ static inline size_t seq_buf_get_buf(struct seq_buf *s, char **bufp) @@ -135,7 +135,7 @@ static inline size_t seq_buf_get_buf(struct seq_buf *s, char **bufp) * @num: the number of bytes to commit * * Commit @num bytes of data written to a buffer previously acquired - * by seq_buf_get. To signal an error condition, or that the data + * by seq_buf_get_buf(). To signal an error condition, or that the data * didn't fit in the available space, pass a negative @num value. */ static inline void seq_buf_commit(struct seq_buf *s, int num) -- cgit v1.2.3 From 68fb3ca0e408e00db1c3f8fccdfa19e274c033be Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 15 Feb 2024 11:14:33 -0800 Subject: update workarounds for gcc "asm goto" issue In commit 4356e9f841f7 ("work around gcc bugs with 'asm goto' with outputs") I did the gcc workaround unconditionally, because the cause of the bad code generation wasn't entirely clear. In the meantime, Jakub Jelinek debugged the issue, and has come up with a fix in gcc [2], which also got backported to the still maintained branches of gcc-11, gcc-12 and gcc-13. Note that while the fix technically wasn't in the original gcc-14 branch, Jakub says: "while it is true that no GCC 14 snapshots until today (or whenever the fix will be committed) have the fix, for GCC trunk it is up to the distros to use the latest snapshot if they use it at all and would allow better testing of the kernel code without the workaround, so that if there are other issues they won't be discovered years later. Most userland code doesn't actually use asm goto with outputs..." so we will consider gcc-14 to be fixed - if somebody is using gcc snapshots of the gcc-14 before the fix, they should upgrade. Note that while the bug goes back to gcc-11, in practice other gcc changes seem to have effectively hidden it since gcc-12.1 as per a bisect by Jakub. So even a gcc-14 snapshot without the fix likely doesn't show actual problems. 
Also, make the default 'asm_goto_output()' macro mark the asm as volatile by hand, because of an unrelated gcc issue [1] where it doesn't match the documented behavior ("asm goto is always volatile"). Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103979 [1] Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113921 [2] Link: https://lore.kernel.org/all/20240208220604.140859-1-seanjc@google.com/ Requested-by: Jakub Jelinek Cc: Uros Bizjak Cc: Nick Desaulniers Cc: Sean Christopherson Cc: Andrew Pinski Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc.h | 7 ++++--- include/linux/compiler_types.h | 9 ++++++++- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index c1a963be7d28..75bd1692d2e3 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -67,10 +67,9 @@ /* * GCC 'asm goto' with outputs miscompiles certain code sequences: * - * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110420 - * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110422 + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113921 * - * Work it around via the same compiler barrier quirk that we used + * Work around it via the same compiler barrier quirk that we used * to use for the old 'asm goto' workaround. * * Also, always mark such 'asm goto' statements as volatile: all @@ -80,8 +79,10 @@ * * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98619 */ +#ifdef CONFIG_GCC_ASM_GOTO_OUTPUT_WORKAROUND #define asm_goto_output(x...) 
\ do { asm volatile goto(x); asm (""); } while (0) +#endif #if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP) #define __HAVE_BUILTIN_BSWAP32__ diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 663d8791c871..0caf354cb94b 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -362,8 +362,15 @@ struct ftrace_likely_data { #define __member_size(p) __builtin_object_size(p, 1) #endif +/* + * Some versions of gcc do not mark 'asm goto' volatile: + * + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103979 + * + * We do it here by hand, because it doesn't hurt. + */ #ifndef asm_goto_output -#define asm_goto_output(x...) asm goto(x) +#define asm_goto_output(x...) asm volatile goto(x) #endif #ifdef CONFIG_CC_HAS_ASM_INLINE -- cgit v1.2.3 From 1269b6d7222f761b6f5fb85b19f7ab76a5bbf803 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Tue, 13 Feb 2024 15:46:05 +0200 Subject: ACPI: utils: Make acpi_handle_path() not static acpi_handle_path() will soon be required for node name comparison elsewhere in ACPI framework. Remove the static keyword and add the prototype to include/linux/acpi.h. Signed-off-by: Sakari Ailus Signed-off-by: Rafael J. 
Wysocki --- include/linux/acpi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index b7165e52b3c6..a170c389dd74 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1170,6 +1170,7 @@ static inline void acpi_ec_set_gpe_wake_mask(u8 action) {} #endif #ifdef CONFIG_ACPI +char *acpi_handle_path(acpi_handle handle); __printf(3, 4) void acpi_handle_printk(const char *level, acpi_handle handle, const char *fmt, ...); -- cgit v1.2.3 From b4ccc4dd1330a4d0db6aa4c6781631d1bab76c45 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Feb 2024 15:30:33 -0700 Subject: io_uring/napi: enable even with a timeout of 0 1 usec is not as short as it used to be, and it makes sense to allow 0 for a busy poll timeout - this means just do one loop to check if we have anything available. Add a separate ->napi_enabled to check if napi has been enabled or not. While at it, move the writing of the ctx napi values after we've copied the old values back to userspace. This ensures that if the call fails, we'll be in the same state as we were before, rather than some indeterminate state. 
Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 4fe7af8a4907..bd7071aeec5d 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -420,6 +420,7 @@ struct io_ring_ctx { /* napi busy poll default timeout */ unsigned int napi_busy_poll_to; bool napi_prefer_busy_poll; + bool napi_enabled; DECLARE_HASHTABLE(napi_ht, 4); #endif -- cgit v1.2.3 From bad5247a2c4f7eab6fb922af3362740a562dc665 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 15 Feb 2024 11:30:49 +0100 Subject: ima: Align ima_inode_post_setattr() definition with LSM infrastructure Change ima_inode_post_setattr() definition, so that it can be registered as implementation of the inode_post_setattr hook (to be introduced). Signed-off-by: Roberto Sassu Reviewed-by: Stefan Berger Reviewed-by: Casey Schaufler Reviewed-by: Mimi Zohar Acked-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/ima.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ima.h b/include/linux/ima.h index 86b57757c7b1..910a2f11a906 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -186,7 +186,7 @@ static inline void ima_post_key_create_or_update(struct key *keyring, #ifdef CONFIG_IMA_APPRAISE extern bool is_ima_appraise_enabled(void); extern void ima_inode_post_setattr(struct mnt_idmap *idmap, - struct dentry *dentry); + struct dentry *dentry, int ia_valid); extern int ima_inode_setxattr(struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len); extern int ima_inode_set_acl(struct mnt_idmap *idmap, @@ -206,7 +206,7 @@ static inline bool is_ima_appraise_enabled(void) } static inline void ima_inode_post_setattr(struct mnt_idmap *idmap, - struct dentry *dentry) + struct dentry *dentry, int ia_valid) { return; } -- cgit v1.2.3 From 
0298c5a9b168f0d74ea3bf881301c4bd9252d367 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 15 Feb 2024 11:30:50 +0100 Subject: ima: Align ima_file_mprotect() definition with LSM infrastructure Change ima_file_mprotect() definition, so that it can be registered as implementation of the file_mprotect hook. Signed-off-by: Roberto Sassu Reviewed-by: Stefan Berger Reviewed-by: Casey Schaufler Reviewed-by: Mimi Zohar Acked-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/ima.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ima.h b/include/linux/ima.h index 910a2f11a906..b66353f679e8 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -23,7 +23,8 @@ extern void ima_post_create_tmpfile(struct mnt_idmap *idmap, extern void ima_file_free(struct file *file); extern int ima_file_mmap(struct file *file, unsigned long reqprot, unsigned long prot, unsigned long flags); -extern int ima_file_mprotect(struct vm_area_struct *vma, unsigned long prot); +extern int ima_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, + unsigned long prot); extern int ima_load_data(enum kernel_load_data_id id, bool contents); extern int ima_post_load_data(char *buf, loff_t size, enum kernel_load_data_id id, char *description); @@ -84,7 +85,7 @@ static inline int ima_file_mmap(struct file *file, unsigned long reqprot, } static inline int ima_file_mprotect(struct vm_area_struct *vma, - unsigned long prot) + unsigned long reqprot, unsigned long prot) { return 0; } -- cgit v1.2.3 From fbd0506e5c5874fd52403fd38e3e77d895689870 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 15 Feb 2024 11:30:51 +0100 Subject: ima: Align ima_inode_setxattr() definition with LSM infrastructure Change ima_inode_setxattr() definition, so that it can be registered as implementation of the inode_setxattr hook. 
Signed-off-by: Roberto Sassu Reviewed-by: Stefan Berger Reviewed-by: Mimi Zohar Reviewed-by: Casey Schaufler Acked-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/ima.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ima.h b/include/linux/ima.h index b66353f679e8..077324309c11 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -188,8 +188,9 @@ static inline void ima_post_key_create_or_update(struct key *keyring, extern bool is_ima_appraise_enabled(void); extern void ima_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int ia_valid); -extern int ima_inode_setxattr(struct dentry *dentry, const char *xattr_name, - const void *xattr_value, size_t xattr_value_len); +extern int ima_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, + const char *xattr_name, const void *xattr_value, + size_t xattr_value_len, int flags); extern int ima_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl); @@ -212,10 +213,12 @@ static inline void ima_inode_post_setattr(struct mnt_idmap *idmap, return; } -static inline int ima_inode_setxattr(struct dentry *dentry, +static inline int ima_inode_setxattr(struct mnt_idmap *idmap, + struct dentry *dentry, const char *xattr_name, const void *xattr_value, - size_t xattr_value_len) + size_t xattr_value_len, + int flags) { return 0; } -- cgit v1.2.3 From 526864dd2f60c16bcdc84e9e7bc6d69d08cfee21 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 15 Feb 2024 11:30:52 +0100 Subject: ima: Align ima_inode_removexattr() definition with LSM infrastructure Change ima_inode_removexattr() definition, so that it can be registered as implementation of the inode_removexattr hook. 
Signed-off-by: Roberto Sassu Reviewed-by: Stefan Berger Reviewed-by: Casey Schaufler Reviewed-by: Mimi Zohar Acked-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/ima.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ima.h b/include/linux/ima.h index 077324309c11..678a03fddd7e 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -200,7 +200,9 @@ static inline int ima_inode_remove_acl(struct mnt_idmap *idmap, { return ima_inode_set_acl(idmap, dentry, acl_name, NULL); } -extern int ima_inode_removexattr(struct dentry *dentry, const char *xattr_name); + +extern int ima_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, + const char *xattr_name); #else static inline bool is_ima_appraise_enabled(void) { @@ -231,7 +233,8 @@ static inline int ima_inode_set_acl(struct mnt_idmap *idmap, return 0; } -static inline int ima_inode_removexattr(struct dentry *dentry, +static inline int ima_inode_removexattr(struct mnt_idmap *idmap, + struct dentry *dentry, const char *xattr_name) { return 0; -- cgit v1.2.3 From fec5f85e468d6f47851d531ec81f9da821768e00 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 15 Feb 2024 11:30:53 +0100 Subject: ima: Align ima_post_read_file() definition with LSM infrastructure Change ima_post_read_file() definition, by making "void *buf" a "char *buf", so that it can be registered as implementation of the post_read_file hook. 
Signed-off-by: Roberto Sassu Reviewed-by: Stefan Berger Reviewed-by: Mimi Zohar Reviewed-by: Casey Schaufler Acked-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/ima.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ima.h b/include/linux/ima.h index 678a03fddd7e..31ef6c3c3207 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -30,7 +30,7 @@ extern int ima_post_load_data(char *buf, loff_t size, enum kernel_load_data_id id, char *description); extern int ima_read_file(struct file *file, enum kernel_read_file_id id, bool contents); -extern int ima_post_read_file(struct file *file, void *buf, loff_t size, +extern int ima_post_read_file(struct file *file, char *buf, loff_t size, enum kernel_read_file_id id); extern void ima_post_path_mknod(struct mnt_idmap *idmap, struct dentry *dentry); @@ -108,7 +108,7 @@ static inline int ima_read_file(struct file *file, enum kernel_read_file_id id, return 0; } -static inline int ima_post_read_file(struct file *file, void *buf, loff_t size, +static inline int ima_post_read_file(struct file *file, char *buf, loff_t size, enum kernel_read_file_id id) { return 0; -- cgit v1.2.3 From 784111d0093e007950cc20033daf3d74ac388821 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 15 Feb 2024 11:30:54 +0100 Subject: evm: Align evm_inode_post_setattr() definition with LSM infrastructure Change evm_inode_post_setattr() definition, so that it can be registered as implementation of the inode_post_setattr hook (to be introduced). 
Signed-off-by: Roberto Sassu Reviewed-by: Stefan Berger Reviewed-by: Casey Schaufler Reviewed-by: Mimi Zohar Acked-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/evm.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/evm.h b/include/linux/evm.h index 36ec884320d9..5cc386312b5a 100644 --- a/include/linux/evm.h +++ b/include/linux/evm.h @@ -23,7 +23,8 @@ extern enum integrity_status evm_verifyxattr(struct dentry *dentry, struct integrity_iint_cache *iint); extern int evm_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); -extern void evm_inode_post_setattr(struct dentry *dentry, int ia_valid); +extern void evm_inode_post_setattr(struct mnt_idmap *idmap, + struct dentry *dentry, int ia_valid); extern int evm_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size); @@ -98,7 +99,8 @@ static inline int evm_inode_setattr(struct mnt_idmap *idmap, return 0; } -static inline void evm_inode_post_setattr(struct dentry *dentry, int ia_valid) +static inline void evm_inode_post_setattr(struct mnt_idmap *idmap, + struct dentry *dentry, int ia_valid) { return; } -- cgit v1.2.3 From 2b6a4054f8c2758cf5c1d78f6ba7006a940b31ce Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 15 Feb 2024 11:30:55 +0100 Subject: evm: Align evm_inode_setxattr() definition with LSM infrastructure Change evm_inode_setxattr() definition, so that it can be registered as implementation of the inode_setxattr hook. 
Signed-off-by: Roberto Sassu Reviewed-by: Stefan Berger Reviewed-by: Mimi Zohar Reviewed-by: Casey Schaufler Acked-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/evm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/evm.h b/include/linux/evm.h index 5cc386312b5a..7de24c1ada90 100644 --- a/include/linux/evm.h +++ b/include/linux/evm.h @@ -27,7 +27,7 @@ extern void evm_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int ia_valid); extern int evm_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, - const void *value, size_t size); + const void *value, size_t size, int flags); extern void evm_inode_post_setxattr(struct dentry *dentry, const char *xattr_name, const void *xattr_value, @@ -107,7 +107,7 @@ static inline void evm_inode_post_setattr(struct mnt_idmap *idmap, static inline int evm_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, - const void *value, size_t size) + const void *value, size_t size, int flags) { return 0; } -- cgit v1.2.3 From 779cb1947e270504c66a96fc0b7e6e31c748b1e1 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 15 Feb 2024 11:30:56 +0100 Subject: evm: Align evm_inode_post_setxattr() definition with LSM infrastructure Change evm_inode_post_setxattr() definition, so that it can be registered as implementation of the inode_post_setxattr hook. 
Signed-off-by: Roberto Sassu Reviewed-by: Stefan Berger Reviewed-by: Mimi Zohar Reviewed-by: Casey Schaufler Acked-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/evm.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/evm.h b/include/linux/evm.h index 7de24c1ada90..3faabdd47852 100644 --- a/include/linux/evm.h +++ b/include/linux/evm.h @@ -31,7 +31,8 @@ extern int evm_inode_setxattr(struct mnt_idmap *idmap, extern void evm_inode_post_setxattr(struct dentry *dentry, const char *xattr_name, const void *xattr_value, - size_t xattr_value_len); + size_t xattr_value_len, + int flags); extern int evm_inode_copy_up_xattr(const char *name); extern int evm_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name); @@ -56,7 +57,7 @@ static inline void evm_inode_post_set_acl(struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { - return evm_inode_post_setxattr(dentry, acl_name, NULL, 0); + return evm_inode_post_setxattr(dentry, acl_name, NULL, 0, 0); } int evm_inode_init_security(struct inode *inode, struct inode *dir, @@ -115,7 +116,8 @@ static inline int evm_inode_setxattr(struct mnt_idmap *idmap, static inline void evm_inode_post_setxattr(struct dentry *dentry, const char *xattr_name, const void *xattr_value, - size_t xattr_value_len) + size_t xattr_value_len, + int flags) { return; } -- cgit v1.2.3 From 314a8dc728d038378795236f6b5199265f921f45 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 15 Feb 2024 11:30:57 +0100 Subject: security: Align inode_setattr hook definition with EVM Add the idmap parameter to the definition, so that evm_inode_setattr() can be registered as this hook implementation. 
Signed-off-by: Roberto Sassu Reviewed-by: Stefan Berger Acked-by: Casey Schaufler Reviewed-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 76458b6d53da..b00b16d58413 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -135,7 +135,8 @@ LSM_HOOK(int, 0, inode_readlink, struct dentry *dentry) LSM_HOOK(int, 0, inode_follow_link, struct dentry *dentry, struct inode *inode, bool rcu) LSM_HOOK(int, 0, inode_permission, struct inode *inode, int mask) -LSM_HOOK(int, 0, inode_setattr, struct dentry *dentry, struct iattr *attr) +LSM_HOOK(int, 0, inode_setattr, struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr) LSM_HOOK(int, 0, inode_getattr, const struct path *path) LSM_HOOK(int, 0, inode_setxattr, struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, -- cgit v1.2.3 From 77fa6f314f0376176ef6bf3d84403e0d8b54ce28 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 15 Feb 2024 11:30:58 +0100 Subject: security: Introduce inode_post_setattr hook In preparation for moving IMA and EVM to the LSM infrastructure, introduce the inode_post_setattr hook. At inode_setattr hook, EVM verifies the file's existing HMAC value. At inode_post_setattr, EVM re-calculates the file's HMAC based on the modified file attributes and other file metadata. Other LSMs could similarly take some action after successful file attribute change. The new hook cannot return an error and cannot cause the operation to be reverted. 
Signed-off-by: Roberto Sassu Reviewed-by: Stefan Berger Reviewed-by: Mimi Zohar Acked-by: Casey Schaufler Acked-by: Christian Brauner Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 2 ++ include/linux/security.h | 7 +++++++ 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index b00b16d58413..a0e9e48015a4 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -137,6 +137,8 @@ LSM_HOOK(int, 0, inode_follow_link, struct dentry *dentry, struct inode *inode, LSM_HOOK(int, 0, inode_permission, struct inode *inode, int mask) LSM_HOOK(int, 0, inode_setattr, struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) +LSM_HOOK(void, LSM_RET_VOID, inode_post_setattr, struct mnt_idmap *idmap, + struct dentry *dentry, int ia_valid) LSM_HOOK(int, 0, inode_getattr, const struct path *path) LSM_HOOK(int, 0, inode_setxattr, struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, diff --git a/include/linux/security.h b/include/linux/security.h index d0eb20f90b26..56c841aa3994 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -361,6 +361,8 @@ int security_inode_follow_link(struct dentry *dentry, struct inode *inode, int security_inode_permission(struct inode *inode, int mask); int security_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); +void security_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + int ia_valid); int security_inode_getattr(const struct path *path); int security_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, @@ -879,6 +881,11 @@ static inline int security_inode_setattr(struct mnt_idmap *idmap, return 0; } +static inline void +security_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + int ia_valid) +{ } + static inline int security_inode_getattr(const struct path *path) { 
return 0; -- cgit v1.2.3 From dae52cbf5887ac51c3574648124cfe475a9b3246 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 15 Feb 2024 11:30:59 +0100 Subject: security: Introduce inode_post_removexattr hook In preparation for moving IMA and EVM to the LSM infrastructure, introduce the inode_post_removexattr hook. At inode_removexattr hook, EVM verifies the file's existing HMAC value. At inode_post_removexattr, EVM re-calculates the file's HMAC with the passed xattr removed and other file metadata. Other LSMs could similarly take some action after successful xattr removal. The new hook cannot return an error and cannot cause the operation to be reverted. Signed-off-by: Roberto Sassu Reviewed-by: Stefan Berger Reviewed-by: Mimi Zohar Reviewed-by: Casey Schaufler Acked-by: Christian Brauner Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 2 ++ include/linux/security.h | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index a0e9e48015a4..f849f7d5bb53 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -149,6 +149,8 @@ LSM_HOOK(int, 0, inode_getxattr, struct dentry *dentry, const char *name) LSM_HOOK(int, 0, inode_listxattr, struct dentry *dentry) LSM_HOOK(int, 0, inode_removexattr, struct mnt_idmap *idmap, struct dentry *dentry, const char *name) +LSM_HOOK(void, LSM_RET_VOID, inode_post_removexattr, struct dentry *dentry, + const char *name) LSM_HOOK(int, 0, inode_set_acl, struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) LSM_HOOK(int, 0, inode_get_acl, struct mnt_idmap *idmap, diff --git a/include/linux/security.h b/include/linux/security.h index 56c841aa3994..84ae03690340 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -380,6 +380,7 @@ int security_inode_getxattr(struct dentry *dentry, const char *name); int security_inode_listxattr(struct dentry *dentry); 
int security_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name); +void security_inode_post_removexattr(struct dentry *dentry, const char *name); int security_inode_need_killpriv(struct dentry *dentry); int security_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry); int security_inode_getsecurity(struct mnt_idmap *idmap, @@ -942,6 +943,10 @@ static inline int security_inode_removexattr(struct mnt_idmap *idmap, return cap_inode_removexattr(idmap, dentry, name); } +static inline void security_inode_post_removexattr(struct dentry *dentry, + const char *name) +{ } + static inline int security_inode_need_killpriv(struct dentry *dentry) { return cap_inode_need_killpriv(dentry); -- cgit v1.2.3 From 8f46ff5767b0b18329140d80d6bcabd818f42c4c Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 15 Feb 2024 11:31:00 +0100 Subject: security: Introduce file_post_open hook In preparation to move IMA and EVM to the LSM infrastructure, introduce the file_post_open hook. Also, export security_file_post_open() for NFS. Based on policy, IMA calculates the digest of the file content and extends the TPM with the digest, verifies the file's integrity based on the digest, and/or includes the file digest in the audit log. LSMs could similarly take action depending on the file content and the access mask requested with open(). The new hook returns a value and can cause the open to be aborted. 
Signed-off-by: Roberto Sassu Reviewed-by: Stefan Berger Acked-by: Casey Schaufler Reviewed-by: Mimi Zohar Acked-by: Christian Brauner Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 1 + include/linux/security.h | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index f849f7d5bb53..3c84942d2818 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -191,6 +191,7 @@ LSM_HOOK(int, 0, file_send_sigiotask, struct task_struct *tsk, struct fown_struct *fown, int sig) LSM_HOOK(int, 0, file_receive, struct file *file) LSM_HOOK(int, 0, file_open, struct file *file) +LSM_HOOK(int, 0, file_post_open, struct file *file, int mask) LSM_HOOK(int, 0, file_truncate, struct file *file) LSM_HOOK(int, 0, task_alloc, struct task_struct *task, unsigned long clone_flags) diff --git a/include/linux/security.h b/include/linux/security.h index 84ae03690340..97f2212c13b6 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -411,6 +411,7 @@ int security_file_send_sigiotask(struct task_struct *tsk, struct fown_struct *fown, int sig); int security_file_receive(struct file *file); int security_file_open(struct file *file); +int security_file_post_open(struct file *file, int mask); int security_file_truncate(struct file *file); int security_task_alloc(struct task_struct *task, unsigned long clone_flags); void security_task_free(struct task_struct *task); @@ -1074,6 +1075,11 @@ static inline int security_file_open(struct file *file) return 0; } +static inline int security_file_post_open(struct file *file, int mask) +{ + return 0; +} + static inline int security_file_truncate(struct file *file) { return 0; -- cgit v1.2.3 From f09068b5a114ed28d2df2e82a7d30dde0145dc69 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 15 Feb 2024 11:31:01 +0100 Subject: security: Introduce file_release hook In preparation for moving IMA and EVM to the LSM 
infrastructure, introduce the file_release hook. IMA calculates at file close the new digest of the file content and writes it to security.ima, so that appraisal at next file access succeeds. The new hook cannot return an error and cannot cause the operation to be reverted. Signed-off-by: Roberto Sassu Acked-by: Christian Brauner Reviewed-by: Stefan Berger Reviewed-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 1 + include/linux/security.h | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 3c84942d2818..7f9e9240606e 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -173,6 +173,7 @@ LSM_HOOK(int, 0, kernfs_init_security, struct kernfs_node *kn_dir, struct kernfs_node *kn) LSM_HOOK(int, 0, file_permission, struct file *file, int mask) LSM_HOOK(int, 0, file_alloc_security, struct file *file) +LSM_HOOK(void, LSM_RET_VOID, file_release, struct file *file) LSM_HOOK(void, LSM_RET_VOID, file_free_security, struct file *file) LSM_HOOK(int, 0, file_ioctl, struct file *file, unsigned int cmd, unsigned long arg) diff --git a/include/linux/security.h b/include/linux/security.h index 97f2212c13b6..2997348afcb7 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -395,6 +395,7 @@ int security_kernfs_init_security(struct kernfs_node *kn_dir, struct kernfs_node *kn); int security_file_permission(struct file *file, int mask); int security_file_alloc(struct file *file); +void security_file_release(struct file *file); void security_file_free(struct file *file); int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int security_file_ioctl_compat(struct file *file, unsigned int cmd, @@ -1008,6 +1009,9 @@ static inline int security_file_alloc(struct file *file) return 0; } +static inline void security_file_release(struct file *file) +{ } + static inline void 
security_file_free(struct file *file) { } -- cgit v1.2.3 From 08abce60d63fb55f440c393f4508e99064f2fd91 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 15 Feb 2024 11:31:02 +0100 Subject: security: Introduce path_post_mknod hook In preparation for moving IMA and EVM to the LSM infrastructure, introduce the path_post_mknod hook. IMA-appraisal requires all existing files in policy to have a file hash/signature stored in security.ima. An exception is made for empty files created by mknod, by tagging them as new files. LSMs could also take some action after files are created. The new hook cannot return an error and cannot cause the operation to be reverted. Signed-off-by: Roberto Sassu Acked-by: Casey Schaufler Reviewed-by: Mimi Zohar Acked-by: Christian Brauner Reviewed-by: Stefan Berger Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 2 ++ include/linux/security.h | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 7f9e9240606e..dba5d8204dc5 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -94,6 +94,8 @@ LSM_HOOK(int, 0, path_mkdir, const struct path *dir, struct dentry *dentry, LSM_HOOK(int, 0, path_rmdir, const struct path *dir, struct dentry *dentry) LSM_HOOK(int, 0, path_mknod, const struct path *dir, struct dentry *dentry, umode_t mode, unsigned int dev) +LSM_HOOK(void, LSM_RET_VOID, path_post_mknod, struct mnt_idmap *idmap, + struct dentry *dentry) LSM_HOOK(int, 0, path_truncate, const struct path *path) LSM_HOOK(int, 0, path_symlink, const struct path *dir, struct dentry *dentry, const char *old_name) diff --git a/include/linux/security.h b/include/linux/security.h index 2997348afcb7..977dd9f7f51a 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1893,6 +1893,7 @@ int security_path_mkdir(const struct path *dir, struct dentry *dentry, umode_t m int security_path_rmdir(const struct path 
*dir, struct dentry *dentry); int security_path_mknod(const struct path *dir, struct dentry *dentry, umode_t mode, unsigned int dev); +void security_path_post_mknod(struct mnt_idmap *idmap, struct dentry *dentry); int security_path_truncate(const struct path *path); int security_path_symlink(const struct path *dir, struct dentry *dentry, const char *old_name); @@ -1927,6 +1928,10 @@ static inline int security_path_mknod(const struct path *dir, struct dentry *den return 0; } +static inline void security_path_post_mknod(struct mnt_idmap *idmap, + struct dentry *dentry) +{ } + static inline int security_path_truncate(const struct path *path) { return 0; -- cgit v1.2.3 From a7811e34d100acf24870eb949c5ae3e49dde18b9 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 15 Feb 2024 11:31:03 +0100 Subject: security: Introduce inode_post_create_tmpfile hook In preparation for moving IMA and EVM to the LSM infrastructure, introduce the inode_post_create_tmpfile hook. As temp files can be made persistent, treat new temp files like other new files, so that the file hash is calculated and stored in the security xattr. LSMs could also take some action after temp files have been created. The new hook cannot return an error and cannot cause the operation to be canceled. 
Signed-off-by: Roberto Sassu Acked-by: Casey Schaufler Reviewed-by: Mimi Zohar Acked-by: Christian Brauner Reviewed-by: Stefan Berger Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 2 ++ include/linux/security.h | 6 ++++++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index dba5d8204dc5..87f60b47dfca 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -121,6 +121,8 @@ LSM_HOOK(int, 0, inode_init_security_anon, struct inode *inode, const struct qstr *name, const struct inode *context_inode) LSM_HOOK(int, 0, inode_create, struct inode *dir, struct dentry *dentry, umode_t mode) +LSM_HOOK(void, LSM_RET_VOID, inode_post_create_tmpfile, struct mnt_idmap *idmap, + struct inode *inode) LSM_HOOK(int, 0, inode_link, struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) LSM_HOOK(int, 0, inode_unlink, struct inode *dir, struct dentry *dentry) diff --git a/include/linux/security.h b/include/linux/security.h index 977dd9f7f51a..1cb604282617 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -344,6 +344,8 @@ int security_inode_init_security_anon(struct inode *inode, const struct qstr *name, const struct inode *context_inode); int security_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode); +void security_inode_post_create_tmpfile(struct mnt_idmap *idmap, + struct inode *inode); int security_inode_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry); int security_inode_unlink(struct inode *dir, struct dentry *dentry); @@ -811,6 +813,10 @@ static inline int security_inode_create(struct inode *dir, return 0; } +static inline void +security_inode_post_create_tmpfile(struct mnt_idmap *idmap, struct inode *inode) +{ } + static inline int security_inode_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) -- cgit v1.2.3 From 
8b9d0b825c6573d654c8b8039ea79920926305c2 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 15 Feb 2024 11:31:04 +0100 Subject: security: Introduce inode_post_set_acl hook In preparation for moving IMA and EVM to the LSM infrastructure, introduce the inode_post_set_acl hook. At inode_set_acl hook, EVM verifies the file's existing HMAC value. At inode_post_set_acl, EVM re-calculates the file's HMAC based on the modified POSIX ACL and other file metadata. Other LSMs could similarly take some action after successful POSIX ACL change. The new hook cannot return an error and cannot cause the operation to be reverted. Signed-off-by: Roberto Sassu Reviewed-by: Stefan Berger Acked-by: Casey Schaufler Reviewed-by: Mimi Zohar Acked-by: Christian Brauner Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 2 ++ include/linux/security.h | 7 +++++++ 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 87f60b47dfca..b0125c99f80a 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -157,6 +157,8 @@ LSM_HOOK(void, LSM_RET_VOID, inode_post_removexattr, struct dentry *dentry, const char *name) LSM_HOOK(int, 0, inode_set_acl, struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) +LSM_HOOK(void, LSM_RET_VOID, inode_post_set_acl, struct dentry *dentry, + const char *acl_name, struct posix_acl *kacl) LSM_HOOK(int, 0, inode_get_acl, struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) LSM_HOOK(int, 0, inode_remove_acl, struct mnt_idmap *idmap, diff --git a/include/linux/security.h b/include/linux/security.h index 1cb604282617..c372797e1617 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -372,6 +372,8 @@ int security_inode_setxattr(struct mnt_idmap *idmap, int security_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl); 
+void security_inode_post_set_acl(struct dentry *dentry, const char *acl_name, + struct posix_acl *kacl); int security_inode_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name); int security_inode_remove_acl(struct mnt_idmap *idmap, @@ -915,6 +917,11 @@ static inline int security_inode_set_acl(struct mnt_idmap *idmap, return 0; } +static inline void security_inode_post_set_acl(struct dentry *dentry, + const char *acl_name, + struct posix_acl *kacl) +{ } + static inline int security_inode_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) -- cgit v1.2.3 From 2d705d8024143c272a764320c880ccd3230bb699 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 15 Feb 2024 11:31:05 +0100 Subject: security: Introduce inode_post_remove_acl hook In preparation for moving IMA and EVM to the LSM infrastructure, introduce the inode_post_remove_acl hook. At inode_remove_acl hook, EVM verifies the file's existing HMAC value. At inode_post_remove_acl, EVM re-calculates the file's HMAC with the passed POSIX ACL removed and other file metadata. Other LSMs could similarly take some action after successful POSIX ACL removal. The new hook cannot return an error and cannot cause the operation to be reverted. 
Signed-off-by: Roberto Sassu Reviewed-by: Stefan Berger Acked-by: Casey Schaufler Reviewed-by: Mimi Zohar Acked-by: Christian Brauner Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 2 ++ include/linux/security.h | 8 ++++++++ 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index b0125c99f80a..7e414ba26333 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -163,6 +163,8 @@ LSM_HOOK(int, 0, inode_get_acl, struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) LSM_HOOK(int, 0, inode_remove_acl, struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) +LSM_HOOK(void, LSM_RET_VOID, inode_post_remove_acl, struct mnt_idmap *idmap, + struct dentry *dentry, const char *acl_name) LSM_HOOK(int, 0, inode_need_killpriv, struct dentry *dentry) LSM_HOOK(int, 0, inode_killpriv, struct mnt_idmap *idmap, struct dentry *dentry) diff --git a/include/linux/security.h b/include/linux/security.h index c372797e1617..4b03c76b91f1 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -378,6 +378,9 @@ int security_inode_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name); int security_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name); +void security_inode_post_remove_acl(struct mnt_idmap *idmap, + struct dentry *dentry, + const char *acl_name); void security_inode_post_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); int security_inode_getxattr(struct dentry *dentry, const char *name); @@ -936,6 +939,11 @@ static inline int security_inode_remove_acl(struct mnt_idmap *idmap, return 0; } +static inline void security_inode_post_remove_acl(struct mnt_idmap *idmap, + struct dentry *dentry, + const char *acl_name) +{ } + static inline void security_inode_post_setxattr(struct dentry *dentry, const char *name, const 
void *value, size_t size, int flags) { } -- cgit v1.2.3 From b8d997032a46fcf47d5bda011c0d1e87b20c08ba Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 15 Feb 2024 11:31:06 +0100 Subject: security: Introduce key_post_create_or_update hook In preparation for moving IMA and EVM to the LSM infrastructure, introduce the key_post_create_or_update hook. Depending on policy, IMA measures the key content after creation or update, so that remote verifiers are aware of the operation. Other LSMs could similarly take some action after successful key creation or update. The new hook cannot return an error and cannot cause the operation to be reverted. Signed-off-by: Roberto Sassu Reviewed-by: Stefan Berger Acked-by: Casey Schaufler Reviewed-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 3 +++ include/linux/security.h | 11 +++++++++++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 7e414ba26333..7e4683035d34 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -405,6 +405,9 @@ LSM_HOOK(void, LSM_RET_VOID, key_free, struct key *key) LSM_HOOK(int, 0, key_permission, key_ref_t key_ref, const struct cred *cred, enum key_need_perm need_perm) LSM_HOOK(int, 0, key_getsecurity, struct key *key, char **buffer) +LSM_HOOK(void, LSM_RET_VOID, key_post_create_or_update, struct key *keyring, + struct key *key, const void *payload, size_t payload_len, + unsigned long flags, bool create) #endif /* CONFIG_KEYS */ #ifdef CONFIG_AUDIT diff --git a/include/linux/security.h b/include/linux/security.h index 4b03c76b91f1..8436f9abf43d 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2004,6 +2004,9 @@ void security_key_free(struct key *key); int security_key_permission(key_ref_t key_ref, const struct cred *cred, enum key_need_perm need_perm); int security_key_getsecurity(struct key *key, char **_buffer); +void 
security_key_post_create_or_update(struct key *keyring, struct key *key, + const void *payload, size_t payload_len, + unsigned long flags, bool create); #else @@ -2031,6 +2034,14 @@ static inline int security_key_getsecurity(struct key *key, char **_buffer) return 0; } +static inline void security_key_post_create_or_update(struct key *keyring, + struct key *key, + const void *payload, + size_t payload_len, + unsigned long flags, + bool create) +{ } + #endif #endif /* CONFIG_KEYS */ -- cgit v1.2.3 From 06cca5110774f7b59a1685431ac697865588f4ca Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 15 Feb 2024 11:31:07 +0100 Subject: integrity: Move integrity_kernel_module_request() to IMA In preparation for removing the 'integrity' LSM, move integrity_kernel_module_request() to IMA, and rename it to ima_kernel_module_request(). Rewrite the function documentation, to explain better what the problem is. Compile it conditionally if CONFIG_INTEGRITY_ASYMMETRIC_KEYS is enabled, and call it from security.c (removed afterwards with the move of IMA to the LSM infrastructure). Adding this hook cannot be avoided, since IMA has no control on the flags passed to crypto_alloc_sig() in public_key_verify_signature(), and thus cannot pass CRYPTO_NOLOAD, which solved the problem for EVM hashing with commit e2861fa71641 ("evm: Don't deadlock if a crypto algorithm is unavailable"). EVM alone does not need to implement this hook, first because there is no mutex to deadlock, and second because even if it had it, there should be a recursive call. However, since verification from EVM can be initiated only by setting inode metadata, deadlock would occur if modprobe would do the same while loading a kernel module (which is unlikely). 
Signed-off-by: Roberto Sassu Reviewed-by: Stefan Berger Reviewed-by: Mimi Zohar Acked-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/ima.h | 10 ++++++++++ include/linux/integrity.h | 13 ------------- 2 files changed, 10 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ima.h b/include/linux/ima.h index 31ef6c3c3207..0f9af283cbc8 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -256,4 +256,14 @@ static inline bool ima_appraise_signature(enum kernel_read_file_id func) return false; } #endif /* CONFIG_IMA_APPRAISE && CONFIG_INTEGRITY_TRUSTED_KEYRING */ + +#if defined(CONFIG_IMA) && defined(CONFIG_INTEGRITY_ASYMMETRIC_KEYS) +extern int ima_kernel_module_request(char *kmod_name); +#else +static inline int ima_kernel_module_request(char *kmod_name) +{ + return 0; +} + +#endif #endif /* _LINUX_IMA_H */ diff --git a/include/linux/integrity.h b/include/linux/integrity.h index 2ea0f2f65ab6..ef0f63ef5ebc 100644 --- a/include/linux/integrity.h +++ b/include/linux/integrity.h @@ -42,17 +42,4 @@ static inline void integrity_load_keys(void) } #endif /* CONFIG_INTEGRITY */ -#ifdef CONFIG_INTEGRITY_ASYMMETRIC_KEYS - -extern int integrity_kernel_module_request(char *kmod_name); - -#else - -static inline int integrity_kernel_module_request(char *kmod_name) -{ - return 0; -} - -#endif /* CONFIG_INTEGRITY_ASYMMETRIC_KEYS */ - #endif /* _LINUX_INTEGRITY_H */ -- cgit v1.2.3 From cd3cec0a02c7338ce2901c574f3935b8f6984aab Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 15 Feb 2024 11:31:08 +0100 Subject: ima: Move to LSM infrastructure Move hardcoded IMA function calls (not appraisal-specific functions) from various places in the kernel to the LSM infrastructure, by introducing a new LSM named 'ima' (at the end of the LSM list and always enabled like 'integrity'). 
Having IMA before EVM in the Makefile is sufficient to preserve the relative order of the new 'ima' LSM with respect to the upcoming 'evm' LSM, and thus the order of IMA and EVM function calls as when they were hardcoded. Make the moved functions static (except ima_post_key_create_or_update(), which is not in ima_main.c), and register them as implementations of the respective hooks in the new function init_ima_lsm(). Select CONFIG_SECURITY_PATH, to ensure that the path-based LSM hook path_post_mknod is always available and ima_post_path_mknod() is always executed to mark files as new, as before the move. A slight difference is that IMA and EVM functions registered for the inode_post_setattr, inode_post_removexattr, path_post_mknod, inode_post_create_tmpfile, inode_post_set_acl and inode_post_remove_acl hooks won't be executed for private inodes. Since those inodes are supposed to be fs-internal, they should not be of interest to IMA or EVM. The S_PRIVATE flag is used for anonymous inodes, hugetlbfs, reiserfs xattrs, XFS scrub and kernel-internal tmpfs files. Conditionally register ima_post_key_create_or_update() if CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS is enabled. Also, conditionally register ima_kernel_module_request() if CONFIG_INTEGRITY_ASYMMETRIC_KEYS is enabled. Finally, add the LSM_ID_IMA case in lsm_list_modules_test.c.
Signed-off-by: Roberto Sassu Acked-by: Chuck Lever Acked-by: Casey Schaufler Acked-by: Christian Brauner Reviewed-by: Stefan Berger Reviewed-by: Mimi Zohar Acked-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/ima.h | 104 ---------------------------------------------------- 1 file changed, 104 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ima.h b/include/linux/ima.h index 0f9af283cbc8..23ae24b60ecf 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -16,24 +16,6 @@ struct linux_binprm; #ifdef CONFIG_IMA extern enum hash_algo ima_get_current_hash_algo(void); -extern int ima_bprm_check(struct linux_binprm *bprm); -extern int ima_file_check(struct file *file, int mask); -extern void ima_post_create_tmpfile(struct mnt_idmap *idmap, - struct inode *inode); -extern void ima_file_free(struct file *file); -extern int ima_file_mmap(struct file *file, unsigned long reqprot, - unsigned long prot, unsigned long flags); -extern int ima_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, - unsigned long prot); -extern int ima_load_data(enum kernel_load_data_id id, bool contents); -extern int ima_post_load_data(char *buf, loff_t size, - enum kernel_load_data_id id, char *description); -extern int ima_read_file(struct file *file, enum kernel_read_file_id id, - bool contents); -extern int ima_post_read_file(struct file *file, char *buf, loff_t size, - enum kernel_read_file_id id); -extern void ima_post_path_mknod(struct mnt_idmap *idmap, - struct dentry *dentry); extern int ima_file_hash(struct file *file, char *buf, size_t buf_size); extern int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size); extern void ima_kexec_cmdline(int kernel_fd, const void *buf, int size); @@ -58,68 +40,6 @@ static inline enum hash_algo ima_get_current_hash_algo(void) return HASH_ALGO__LAST; } -static inline int ima_bprm_check(struct linux_binprm *bprm) -{ - return 0; -} - -static inline int ima_file_check(struct file *file, int mask) 
-{ - return 0; -} - -static inline void ima_post_create_tmpfile(struct mnt_idmap *idmap, - struct inode *inode) -{ -} - -static inline void ima_file_free(struct file *file) -{ - return; -} - -static inline int ima_file_mmap(struct file *file, unsigned long reqprot, - unsigned long prot, unsigned long flags) -{ - return 0; -} - -static inline int ima_file_mprotect(struct vm_area_struct *vma, - unsigned long reqprot, unsigned long prot) -{ - return 0; -} - -static inline int ima_load_data(enum kernel_load_data_id id, bool contents) -{ - return 0; -} - -static inline int ima_post_load_data(char *buf, loff_t size, - enum kernel_load_data_id id, - char *description) -{ - return 0; -} - -static inline int ima_read_file(struct file *file, enum kernel_read_file_id id, - bool contents) -{ - return 0; -} - -static inline int ima_post_read_file(struct file *file, char *buf, loff_t size, - enum kernel_read_file_id id) -{ - return 0; -} - -static inline void ima_post_path_mknod(struct mnt_idmap *idmap, - struct dentry *dentry) -{ - return; -} - static inline int ima_file_hash(struct file *file, char *buf, size_t buf_size) { return -EOPNOTSUPP; @@ -170,20 +90,6 @@ static inline void ima_add_kexec_buffer(struct kimage *image) {} #endif -#ifdef CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS -extern void ima_post_key_create_or_update(struct key *keyring, - struct key *key, - const void *payload, size_t plen, - unsigned long flags, bool create); -#else -static inline void ima_post_key_create_or_update(struct key *keyring, - struct key *key, - const void *payload, - size_t plen, - unsigned long flags, - bool create) {} -#endif /* CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS */ - #ifdef CONFIG_IMA_APPRAISE extern bool is_ima_appraise_enabled(void); extern void ima_inode_post_setattr(struct mnt_idmap *idmap, @@ -256,14 +162,4 @@ static inline bool ima_appraise_signature(enum kernel_read_file_id func) return false; } #endif /* CONFIG_IMA_APPRAISE && CONFIG_INTEGRITY_TRUSTED_KEYRING */ - -#if 
defined(CONFIG_IMA) && defined(CONFIG_INTEGRITY_ASYMMETRIC_KEYS) -extern int ima_kernel_module_request(char *kmod_name); -#else -static inline int ima_kernel_module_request(char *kmod_name) -{ - return 0; -} - -#endif #endif /* _LINUX_IMA_H */ -- cgit v1.2.3 From 84594c9ecdca7ca595bc50e315093cb76921fd8e Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 15 Feb 2024 11:31:09 +0100 Subject: ima: Move IMA-Appraisal to LSM infrastructure A few additional IMA hooks are needed to reset the cached appraisal status, causing the file's integrity to be re-evaluated on next access. Register these IMA-appraisal only functions separately from the rest of IMA functions, as appraisal is a separate feature not necessarily enabled in the kernel configuration. Reuse the same approach as for other IMA functions, move hardcoded calls from various places in the kernel to the LSM infrastructure. Declare the functions as static and register them as hook implementations in init_ima_appraise_lsm(), called by init_ima_lsm(). Also move the inline function ima_inode_remove_acl() from the public ima.h header to ima_appraise.c. 
Signed-off-by: Roberto Sassu Reviewed-by: Stefan Berger Reviewed-by: Mimi Zohar Reviewed-by: Casey Schaufler Acked-by: Christian Brauner Acked-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/ima.h | 55 ----------------------------------------------------- 1 file changed, 55 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ima.h b/include/linux/ima.h index 23ae24b60ecf..0bae61a15b60 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -92,66 +92,11 @@ static inline void ima_add_kexec_buffer(struct kimage *image) #ifdef CONFIG_IMA_APPRAISE extern bool is_ima_appraise_enabled(void); -extern void ima_inode_post_setattr(struct mnt_idmap *idmap, - struct dentry *dentry, int ia_valid); -extern int ima_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, - const char *xattr_name, const void *xattr_value, - size_t xattr_value_len, int flags); -extern int ima_inode_set_acl(struct mnt_idmap *idmap, - struct dentry *dentry, const char *acl_name, - struct posix_acl *kacl); -static inline int ima_inode_remove_acl(struct mnt_idmap *idmap, - struct dentry *dentry, - const char *acl_name) -{ - return ima_inode_set_acl(idmap, dentry, acl_name, NULL); -} - -extern int ima_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, - const char *xattr_name); #else static inline bool is_ima_appraise_enabled(void) { return 0; } - -static inline void ima_inode_post_setattr(struct mnt_idmap *idmap, - struct dentry *dentry, int ia_valid) -{ - return; -} - -static inline int ima_inode_setxattr(struct mnt_idmap *idmap, - struct dentry *dentry, - const char *xattr_name, - const void *xattr_value, - size_t xattr_value_len, - int flags) -{ - return 0; -} - -static inline int ima_inode_set_acl(struct mnt_idmap *idmap, - struct dentry *dentry, const char *acl_name, - struct posix_acl *kacl) -{ - - return 0; -} - -static inline int ima_inode_removexattr(struct mnt_idmap *idmap, - struct dentry *dentry, - const char *xattr_name) -{ - return 0; 
-} - -static inline int ima_inode_remove_acl(struct mnt_idmap *idmap, - struct dentry *dentry, - const char *acl_name) -{ - return 0; -} #endif /* CONFIG_IMA_APPRAISE */ #if defined(CONFIG_IMA_APPRAISE) && defined(CONFIG_INTEGRITY_TRUSTED_KEYRING) -- cgit v1.2.3 From 9238311176115aac1b1a86e8e968c04ebec747a1 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 15 Feb 2024 11:31:10 +0100 Subject: evm: Move to LSM infrastructure As for IMA, move hardcoded EVM function calls from various places in the kernel to the LSM infrastructure, by introducing a new LSM named 'evm' (last and always enabled like 'ima'). The order in the Makefile ensures that 'evm' hooks are executed after 'ima' ones. Make EVM functions static (except for evm_inode_init_security(), which is exported), and register them as hook implementations in init_evm_lsm(). Also move the inline functions evm_inode_remove_acl(), evm_inode_post_remove_acl(), and evm_inode_post_set_acl() from the public evm.h header to evm_main.c. Unlike before (see the commit that moves IMA to the LSM infrastructure), evm_inode_post_setattr(), evm_inode_post_set_acl(), evm_inode_post_remove_acl(), and evm_inode_post_removexattr() are not executed for private inodes.
Finally, add the LSM_ID_EVM case in lsm_list_modules_test.c Signed-off-by: Roberto Sassu Reviewed-by: Casey Schaufler Acked-by: Christian Brauner Reviewed-by: Stefan Berger Reviewed-by: Mimi Zohar Acked-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/evm.h | 113 ---------------------------------------------------- 1 file changed, 113 deletions(-) (limited to 'include/linux') diff --git a/include/linux/evm.h b/include/linux/evm.h index 3faabdd47852..cb481eccc967 100644 --- a/include/linux/evm.h +++ b/include/linux/evm.h @@ -21,45 +21,6 @@ extern enum integrity_status evm_verifyxattr(struct dentry *dentry, void *xattr_value, size_t xattr_value_len, struct integrity_iint_cache *iint); -extern int evm_inode_setattr(struct mnt_idmap *idmap, - struct dentry *dentry, struct iattr *attr); -extern void evm_inode_post_setattr(struct mnt_idmap *idmap, - struct dentry *dentry, int ia_valid); -extern int evm_inode_setxattr(struct mnt_idmap *idmap, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags); -extern void evm_inode_post_setxattr(struct dentry *dentry, - const char *xattr_name, - const void *xattr_value, - size_t xattr_value_len, - int flags); -extern int evm_inode_copy_up_xattr(const char *name); -extern int evm_inode_removexattr(struct mnt_idmap *idmap, - struct dentry *dentry, const char *xattr_name); -extern void evm_inode_post_removexattr(struct dentry *dentry, - const char *xattr_name); -static inline void evm_inode_post_remove_acl(struct mnt_idmap *idmap, - struct dentry *dentry, - const char *acl_name) -{ - evm_inode_post_removexattr(dentry, acl_name); -} -extern int evm_inode_set_acl(struct mnt_idmap *idmap, - struct dentry *dentry, const char *acl_name, - struct posix_acl *kacl); -static inline int evm_inode_remove_acl(struct mnt_idmap *idmap, - struct dentry *dentry, - const char *acl_name) -{ - return evm_inode_set_acl(idmap, dentry, acl_name, NULL); -} -static inline void evm_inode_post_set_acl(struct dentry 
*dentry, - const char *acl_name, - struct posix_acl *kacl) -{ - return evm_inode_post_setxattr(dentry, acl_name, NULL, 0, 0); -} - int evm_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, struct xattr *xattrs, int *xattr_count); @@ -94,80 +55,6 @@ static inline enum integrity_status evm_verifyxattr(struct dentry *dentry, } #endif -static inline int evm_inode_setattr(struct mnt_idmap *idmap, - struct dentry *dentry, struct iattr *attr) -{ - return 0; -} - -static inline void evm_inode_post_setattr(struct mnt_idmap *idmap, - struct dentry *dentry, int ia_valid) -{ - return; -} - -static inline int evm_inode_setxattr(struct mnt_idmap *idmap, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) -{ - return 0; -} - -static inline void evm_inode_post_setxattr(struct dentry *dentry, - const char *xattr_name, - const void *xattr_value, - size_t xattr_value_len, - int flags) -{ - return; -} - -static inline int evm_inode_copy_up_xattr(const char *name) -{ - return 0; -} - -static inline int evm_inode_removexattr(struct mnt_idmap *idmap, - struct dentry *dentry, - const char *xattr_name) -{ - return 0; -} - -static inline void evm_inode_post_removexattr(struct dentry *dentry, - const char *xattr_name) -{ - return; -} - -static inline void evm_inode_post_remove_acl(struct mnt_idmap *idmap, - struct dentry *dentry, - const char *acl_name) -{ - return; -} - -static inline int evm_inode_set_acl(struct mnt_idmap *idmap, - struct dentry *dentry, const char *acl_name, - struct posix_acl *kacl) -{ - return 0; -} - -static inline int evm_inode_remove_acl(struct mnt_idmap *idmap, - struct dentry *dentry, - const char *acl_name) -{ - return 0; -} - -static inline void evm_inode_post_set_acl(struct dentry *dentry, - const char *acl_name, - struct posix_acl *kacl) -{ - return; -} - static inline int evm_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, struct xattr *xattrs, -- cgit 
v1.2.3 From 75a323e604fc77c50c7ef2af6f0eeef221637642 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 15 Feb 2024 11:31:11 +0100 Subject: evm: Make it independent from 'integrity' LSM Define a new structure for EVM-specific metadata, called evm_iint_cache, and embed it in the inode security blob. Introduce evm_iint_inode() to retrieve metadata, and register evm_inode_alloc_security() for the inode_alloc_security LSM hook, to initialize the structure (before splitting metadata, this task was done by iint_init_always()). Keep the non-NULL checks after calling evm_iint_inode() except in evm_inode_alloc_security(), to take into account inodes for which security_inode_alloc() was not called. When using shared metadata, obtaining a NULL pointer from integrity_iint_find() meant that the file wasn't in the IMA policy. Now, because IMA and EVM use disjoint metadata, the EVM status has to be stored for every inode regardless of the IMA policy. Given that from now on EVM relies on its own metadata, remove the iint parameter from evm_verifyxattr(). Also, directly retrieve the iint in evm_verify_hmac(), called by both evm_verifyxattr() and evm_verify_current_integrity(), since now there is no performance penalty in retrieving EVM metadata (constant time). Replicate the management of the IMA_NEW_FILE flag, by introducing evm_post_path_mknod() and evm_file_release() to respectively set and clear the newly introduced flag EVM_NEW_FILE, at the same time IMA does. Like for IMA, select CONFIG_SECURITY_PATH when EVM is enabled, to ensure that files are marked as new. Unlike ima_post_path_mknod(), evm_post_path_mknod() cannot check if a file must be appraised. Thus, it marks all affected files. Also, it does not clear EVM_NEW_FILE depending on i_version, but that is not a problem because IMA_NEW_FILE is always cleared when set in ima_check_last_writer(). 
Move the EVM-specific flag EVM_IMMUTABLE_DIGSIG to security/integrity/evm/evm.h, since that definition is now unnecessary in the common integrity layer. Finally, switch to the LSM reservation mechanism for the EVM xattr, and consequently decrement by one the number of xattrs to allocate in security_inode_init_security(). Signed-off-by: Roberto Sassu Reviewed-by: Casey Schaufler Reviewed-by: Stefan Berger Reviewed-by: Mimi Zohar Acked-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/evm.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/evm.h b/include/linux/evm.h index cb481eccc967..d48d6da32315 100644 --- a/include/linux/evm.h +++ b/include/linux/evm.h @@ -12,15 +12,12 @@ #include #include -struct integrity_iint_cache; - #ifdef CONFIG_EVM extern int evm_set_key(void *key, size_t keylen); extern enum integrity_status evm_verifyxattr(struct dentry *dentry, const char *xattr_name, void *xattr_value, - size_t xattr_value_len, - struct integrity_iint_cache *iint); + size_t xattr_value_len); int evm_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, struct xattr *xattrs, int *xattr_count); @@ -48,8 +45,7 @@ static inline int evm_set_key(void *key, size_t keylen) static inline enum integrity_status evm_verifyxattr(struct dentry *dentry, const char *xattr_name, void *xattr_value, - size_t xattr_value_len, - struct integrity_iint_cache *iint) + size_t xattr_value_len) { return INTEGRITY_UNKNOWN; } -- cgit v1.2.3 From b6c0dec9f78bc691166d60347b52b3adf03d5875 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 15 Feb 2024 11:31:13 +0100 Subject: integrity: Remove LSM Since now IMA and EVM use their own integrity metadata, it is safe to remove the 'integrity' LSM, with its management of integrity metadata. 
Keep the iint.c file only for loading IMA and EVM keys at boot, and for creating the integrity directory in securityfs (we need to keep it for backward-compatibility reasons). Signed-off-by: Roberto Sassu Reviewed-by: Casey Schaufler Reviewed-by: Stefan Berger Reviewed-by: Mimi Zohar Acked-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/integrity.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/integrity.h b/include/linux/integrity.h index ef0f63ef5ebc..459b79683783 100644 --- a/include/linux/integrity.h +++ b/include/linux/integrity.h @@ -19,24 +19,10 @@ enum integrity_status { INTEGRITY_UNKNOWN, }; -/* List of EVM protected security xattrs */ #ifdef CONFIG_INTEGRITY -extern struct integrity_iint_cache *integrity_inode_get(struct inode *inode); -extern void integrity_inode_free(struct inode *inode); extern void __init integrity_load_keys(void); #else -static inline struct integrity_iint_cache * - integrity_inode_get(struct inode *inode) -{ - return NULL; -} - -static inline void integrity_inode_free(struct inode *inode) -{ - return; -} - static inline void integrity_load_keys(void) { } -- cgit v1.2.3 From 81a7d0c4d059cb5c122110acbeec7bedfb91a741 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Tue, 13 Feb 2024 11:36:56 -0300 Subject: soundwire: bus_type: make sdw_bus_type const Since commit d492cc2573a0 ("driver core: device.h: make struct bus_type a const *"), the driver core can properly handle constant struct bus_type. Move the sdw_bus_type variable to be a constant structure as well, placing it into read-only memory which cannot be modified at runtime. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: "Ricardo B.
Marliere" Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20240213-bus_cleanup-soundwire-v1-1-3878b00f6f57@marliere.net Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_type.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_type.h b/include/linux/soundwire/sdw_type.h index d8c27f1e5559..b445f7200f06 100644 --- a/include/linux/soundwire/sdw_type.h +++ b/include/linux/soundwire/sdw_type.h @@ -4,7 +4,7 @@ #ifndef __SOUNDWIRE_TYPES_H #define __SOUNDWIRE_TYPES_H -extern struct bus_type sdw_bus_type; +extern const struct bus_type sdw_bus_type; extern struct device_type sdw_slave_type; extern struct device_type sdw_master_type; -- cgit v1.2.3 From 00a9bc6070434814d39118a0de70c1645f64bf60 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:12 +0800 Subject: iommu: Move iommu fault data to linux/iommu.h The iommu fault data is currently defined in uapi/linux/iommu.h, but is only used inside the iommu subsystem. Move it to linux/iommu.h, where it will be more accessible to kernel drivers. With this done, uapi/linux/iommu.h becomes empty and can be removed from the tree. 
Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 152 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 151 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 1ea2a820e1eb..472a8ce029b1 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -14,7 +14,6 @@ #include #include #include -#include #define IOMMU_READ (1 << 0) #define IOMMU_WRITE (1 << 1) @@ -44,6 +43,157 @@ struct iommu_sva; struct iommu_fault_event; struct iommu_dma_cookie; +#define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ +#define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */ +#define IOMMU_FAULT_PERM_EXEC (1 << 2) /* exec */ +#define IOMMU_FAULT_PERM_PRIV (1 << 3) /* privileged */ + +/* Generic fault types, can be expanded IRQ remapping fault */ +enum iommu_fault_type { + IOMMU_FAULT_DMA_UNRECOV = 1, /* unrecoverable fault */ + IOMMU_FAULT_PAGE_REQ, /* page request fault */ +}; + +enum iommu_fault_reason { + IOMMU_FAULT_REASON_UNKNOWN = 0, + + /* Could not access the PASID table (fetch caused external abort) */ + IOMMU_FAULT_REASON_PASID_FETCH, + + /* PASID entry is invalid or has configuration errors */ + IOMMU_FAULT_REASON_BAD_PASID_ENTRY, + + /* + * PASID is out of range (e.g. exceeds the maximum PASID + * supported by the IOMMU) or disabled. 
+ */ + IOMMU_FAULT_REASON_PASID_INVALID, + + /* + * An external abort occurred fetching (or updating) a translation + * table descriptor + */ + IOMMU_FAULT_REASON_WALK_EABT, + + /* + * Could not access the page table entry (Bad address), + * actual translation fault + */ + IOMMU_FAULT_REASON_PTE_FETCH, + + /* Protection flag check failed */ + IOMMU_FAULT_REASON_PERMISSION, + + /* access flag check failed */ + IOMMU_FAULT_REASON_ACCESS, + + /* Output address of a translation stage caused Address Size fault */ + IOMMU_FAULT_REASON_OOR_ADDRESS, +}; + +/** + * struct iommu_fault_unrecoverable - Unrecoverable fault data + * @reason: reason of the fault, from &enum iommu_fault_reason + * @flags: parameters of this fault (IOMMU_FAULT_UNRECOV_* values) + * @pasid: Process Address Space ID + * @perm: requested permission access using by the incoming transaction + * (IOMMU_FAULT_PERM_* values) + * @addr: offending page address + * @fetch_addr: address that caused a fetch abort, if any + */ +struct iommu_fault_unrecoverable { + __u32 reason; +#define IOMMU_FAULT_UNRECOV_PASID_VALID (1 << 0) +#define IOMMU_FAULT_UNRECOV_ADDR_VALID (1 << 1) +#define IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID (1 << 2) + __u32 flags; + __u32 pasid; + __u32 perm; + __u64 addr; + __u64 fetch_addr; +}; + +/** + * struct iommu_fault_page_request - Page Request data + * @flags: encodes whether the corresponding fields are valid and whether this + * is the last page in group (IOMMU_FAULT_PAGE_REQUEST_* values). + * When IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID is set, the page response + * must have the same PASID value as the page request. When it is clear, + * the page response should not have a PASID. 
+ * @pasid: Process Address Space ID + * @grpid: Page Request Group Index + * @perm: requested page permissions (IOMMU_FAULT_PERM_* values) + * @addr: page address + * @private_data: device-specific private information + */ +struct iommu_fault_page_request { +#define IOMMU_FAULT_PAGE_REQUEST_PASID_VALID (1 << 0) +#define IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE (1 << 1) +#define IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA (1 << 2) +#define IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID (1 << 3) + __u32 flags; + __u32 pasid; + __u32 grpid; + __u32 perm; + __u64 addr; + __u64 private_data[2]; +}; + +/** + * struct iommu_fault - Generic fault data + * @type: fault type from &enum iommu_fault_type + * @padding: reserved for future use (should be zero) + * @event: fault event, when @type is %IOMMU_FAULT_DMA_UNRECOV + * @prm: Page Request message, when @type is %IOMMU_FAULT_PAGE_REQ + * @padding2: sets the fault size to allow for future extensions + */ +struct iommu_fault { + __u32 type; + __u32 padding; + union { + struct iommu_fault_unrecoverable event; + struct iommu_fault_page_request prm; + __u8 padding2[56]; + }; +}; + +/** + * enum iommu_page_response_code - Return status of fault handlers + * @IOMMU_PAGE_RESP_SUCCESS: Fault has been handled and the page tables + * populated, retry the access. This is "Success" in PCI PRI. + * @IOMMU_PAGE_RESP_FAILURE: General error. Drop all subsequent faults from + * this device if possible. This is "Response Failure" in PCI PRI. + * @IOMMU_PAGE_RESP_INVALID: Could not handle this fault, don't retry the + * access. This is "Invalid Request" in PCI PRI. 
+ */ +enum iommu_page_response_code { + IOMMU_PAGE_RESP_SUCCESS = 0, + IOMMU_PAGE_RESP_INVALID, + IOMMU_PAGE_RESP_FAILURE, +}; + +/** + * struct iommu_page_response - Generic page response information + * @argsz: User filled size of this data + * @version: API version of this structure + * @flags: encodes whether the corresponding fields are valid + * (IOMMU_FAULT_PAGE_RESPONSE_* values) + * @pasid: Process Address Space ID + * @grpid: Page Request Group Index + * @code: response code from &enum iommu_page_response_code + */ +struct iommu_page_response { + __u32 argsz; +#define IOMMU_PAGE_RESP_VERSION_1 1 + __u32 version; +#define IOMMU_PAGE_RESP_PASID_VALID (1 << 0) + __u32 flags; + __u32 pasid; + __u32 grpid; + __u32 code; +}; + + /* iommu fault flags */ #define IOMMU_FAULT_READ 0x0 #define IOMMU_FAULT_WRITE 0x1 -- cgit v1.2.3 From 0edeab66eba88947dabe8634a3efd136cc771750 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:14 +0800 Subject: iommu: Remove unrecoverable fault data The unrecoverable fault data is not used anywhere. Remove it to avoid dead code. 
Suggested-by: Kevin Tian Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 72 ++------------------------------------------------- 1 file changed, 2 insertions(+), 70 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 472a8ce029b1..c960c4fae3bc 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -50,67 +50,7 @@ struct iommu_dma_cookie; /* Generic fault types, can be expanded IRQ remapping fault */ enum iommu_fault_type { - IOMMU_FAULT_DMA_UNRECOV = 1, /* unrecoverable fault */ - IOMMU_FAULT_PAGE_REQ, /* page request fault */ -}; - -enum iommu_fault_reason { - IOMMU_FAULT_REASON_UNKNOWN = 0, - - /* Could not access the PASID table (fetch caused external abort) */ - IOMMU_FAULT_REASON_PASID_FETCH, - - /* PASID entry is invalid or has configuration errors */ - IOMMU_FAULT_REASON_BAD_PASID_ENTRY, - - /* - * PASID is out of range (e.g. exceeds the maximum PASID - * supported by the IOMMU) or disabled. 
- */ - IOMMU_FAULT_REASON_PASID_INVALID, - - /* - * An external abort occurred fetching (or updating) a translation - * table descriptor - */ - IOMMU_FAULT_REASON_WALK_EABT, - - /* - * Could not access the page table entry (Bad address), - * actual translation fault - */ - IOMMU_FAULT_REASON_PTE_FETCH, - - /* Protection flag check failed */ - IOMMU_FAULT_REASON_PERMISSION, - - /* access flag check failed */ - IOMMU_FAULT_REASON_ACCESS, - - /* Output address of a translation stage caused Address Size fault */ - IOMMU_FAULT_REASON_OOR_ADDRESS, -}; - -/** - * struct iommu_fault_unrecoverable - Unrecoverable fault data - * @reason: reason of the fault, from &enum iommu_fault_reason - * @flags: parameters of this fault (IOMMU_FAULT_UNRECOV_* values) - * @pasid: Process Address Space ID - * @perm: requested permission access using by the incoming transaction - * (IOMMU_FAULT_PERM_* values) - * @addr: offending page address - * @fetch_addr: address that caused a fetch abort, if any - */ -struct iommu_fault_unrecoverable { - __u32 reason; -#define IOMMU_FAULT_UNRECOV_PASID_VALID (1 << 0) -#define IOMMU_FAULT_UNRECOV_ADDR_VALID (1 << 1) -#define IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID (1 << 2) - __u32 flags; - __u32 pasid; - __u32 perm; - __u64 addr; - __u64 fetch_addr; + IOMMU_FAULT_PAGE_REQ = 1, /* page request fault */ }; /** @@ -142,19 +82,11 @@ struct iommu_fault_page_request { /** * struct iommu_fault - Generic fault data * @type: fault type from &enum iommu_fault_type - * @padding: reserved for future use (should be zero) - * @event: fault event, when @type is %IOMMU_FAULT_DMA_UNRECOV * @prm: Page Request message, when @type is %IOMMU_FAULT_PAGE_REQ - * @padding2: sets the fault size to allow for future extensions */ struct iommu_fault { __u32 type; - __u32 padding; - union { - struct iommu_fault_unrecoverable event; - struct iommu_fault_page_request prm; - __u8 padding2[56]; - }; + struct iommu_fault_page_request prm; }; /** -- cgit v1.2.3 From 
8b32a3bea2629049c484f595af7aad797e24453e Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:15 +0800 Subject: iommu: Cleanup iopf data structure definitions struct iommu_fault_page_request and struct iommu_page_response are not part of uAPI anymore. Convert them to data structures for kAPI. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c960c4fae3bc..829bcb5a8e23 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -71,12 +71,12 @@ struct iommu_fault_page_request { #define IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE (1 << 1) #define IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA (1 << 2) #define IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID (1 << 3) - __u32 flags; - __u32 pasid; - __u32 grpid; - __u32 perm; - __u64 addr; - __u64 private_data[2]; + u32 flags; + u32 pasid; + u32 grpid; + u32 perm; + u64 addr; + u64 private_data[2]; }; /** @@ -85,7 +85,7 @@ struct iommu_fault_page_request { * @prm: Page Request message, when @type is %IOMMU_FAULT_PAGE_REQ */ struct iommu_fault { - __u32 type; + u32 type; struct iommu_fault_page_request prm; }; @@ -106,8 +106,6 @@ enum iommu_page_response_code { /** * struct iommu_page_response - Generic page response information - * @argsz: User filled size of this data - * @version: API version of this structure * @flags: encodes whether the corresponding fields are valid * (IOMMU_FAULT_PAGE_RESPONSE_* values) * @pasid: Process Address Space ID @@ -115,14 +113,11 @@ enum iommu_page_response_code { * @code: response code from &enum iommu_page_response_code */ struct iommu_page_response { - __u32 argsz; -#define 
IOMMU_PAGE_RESP_VERSION_1 1 - __u32 version; #define IOMMU_PAGE_RESP_PASID_VALID (1 << 0) - __u32 flags; - __u32 pasid; - __u32 grpid; - __u32 code; + u32 flags; + u32 pasid; + u32 grpid; + u32 code; }; -- cgit v1.2.3 From 15fc60cdd2d236a73b32c99d21fc0f7b7ce6cbbb Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:16 +0800 Subject: iommu: Merge iopf_device_param into iommu_fault_param The struct dev_iommu contains two pointers, fault_param and iopf_param. The fault_param pointer points to a data structure that is used to store pending faults that are awaiting responses. The iopf_param pointer points to a data structure that is used to store partial faults that are part of a Page Request Group. The fault_param and iopf_param pointers are essentially duplicates, which wastes memory. Merge the iopf_device_param pointer into the iommu_fault_param pointer to consolidate the code and save memory. The consolidated pointer would be allocated on demand when the device driver enables iopf on the device, and would be freed after iopf is disabled.
Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-6-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 829bcb5a8e23..bbb7c2ad5184 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -42,6 +42,7 @@ struct notifier_block; struct iommu_sva; struct iommu_fault_event; struct iommu_dma_cookie; +struct iopf_queue; #define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ #define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */ @@ -672,21 +673,31 @@ struct iommu_fault_event { * struct iommu_fault_param - per-device IOMMU fault data * @handler: Callback function to handle IOMMU faults at device level * @data: handler private data - * @faults: holds the pending faults which needs response * @lock: protect pending faults list + * @dev: the device that owns this param + * @queue: IOPF queue + * @queue_list: index into queue->devices + * @partial: faults that are part of a Page Request Group for which the last + * request hasn't been submitted yet. 
+ * @faults: holds the pending faults which need response */ struct iommu_fault_param { iommu_dev_fault_handler_t handler; void *data; - struct list_head faults; struct mutex lock; + + struct device *dev; + struct iopf_queue *queue; + struct list_head queue_list; + + struct list_head partial; + struct list_head faults; }; /** * struct dev_iommu - Collection of per-device IOMMU data * * @fault_param: IOMMU detected device fault reporting data - * @iopf_param: I/O Page Fault queue and data * @fwspec: IOMMU fwspec data * @iommu_dev: IOMMU device this device is linked to * @priv: IOMMU Driver private data @@ -702,7 +713,6 @@ struct iommu_fault_param { struct dev_iommu { struct mutex lock; struct iommu_fault_param *fault_param; - struct iopf_device_param *iopf_param; struct iommu_fwspec *fwspec; struct iommu_device *iommu_dev; void *priv; -- cgit v1.2.3 From 1ff25d798e52943d037accf15c675a6845d9776f Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:17 +0800 Subject: iommu: Remove iommu_[un]register_device_fault_handler() The individual iommu driver reports the iommu page faults by calling iommu_report_device_fault(), where a pre-registered device fault handler is called to route the fault to another fault handler installed on the corresponding iommu domain. The pre-registered device fault handler is static and won't be dynamic as the fault handler is eventually per iommu domain. Replace calling device fault handler with iommu_queue_iopf(). After this replacement, the registering and unregistering fault handler interfaces are not needed anywhere. Remove the interfaces and the related data structures to avoid dead code. Convert cookie parameter of iommu_queue_iopf() into a device pointer that is really passed. 
Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-7-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index bbb7c2ad5184..70176c1c5573 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -128,7 +128,6 @@ struct iommu_page_response { typedef int (*iommu_fault_handler_t)(struct iommu_domain *, struct device *, unsigned long, int, void *); -typedef int (*iommu_dev_fault_handler_t)(struct iommu_fault *, void *); struct iommu_domain_geometry { dma_addr_t aperture_start; /* First address that can be mapped */ @@ -671,8 +670,6 @@ struct iommu_fault_event { /** * struct iommu_fault_param - per-device IOMMU fault data - * @handler: Callback function to handle IOMMU faults at device level - * @data: handler private data * @lock: protect pending faults list * @dev: the device that owns this param * @queue: IOPF queue @@ -682,8 +679,6 @@ struct iommu_fault_event { * @faults: holds the pending faults which need response */ struct iommu_fault_param { - iommu_dev_fault_handler_t handler; - void *data; struct mutex lock; struct device *dev; @@ -806,11 +801,6 @@ extern int iommu_group_for_each_dev(struct iommu_group *group, void *data, extern struct iommu_group *iommu_group_get(struct device *dev); extern struct iommu_group *iommu_group_ref_get(struct iommu_group *group); extern void iommu_group_put(struct iommu_group *group); -extern int iommu_register_device_fault_handler(struct device *dev, - iommu_dev_fault_handler_t handler, - void *data); - -extern int iommu_unregister_device_fault_handler(struct device *dev); extern int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt); @@ -1222,19 +1212,6 @@ static inline void iommu_group_put(struct 
iommu_group *group) { } -static inline -int iommu_register_device_fault_handler(struct device *dev, - iommu_dev_fault_handler_t handler, - void *data) -{ - return -ENODEV; -} - -static inline int iommu_unregister_device_fault_handler(struct device *dev) -{ - return 0; -} - static inline int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) { -- cgit v1.2.3 From 3f02a9dc70007c0e6299fda9c4f7a1e2277ec3d2 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:18 +0800 Subject: iommu: Merge iommu_fault_event and iopf_fault The iommu_fault_event and iopf_fault data structures store the same information about an iopf fault. They are also used in the same way. Merge these two data structures into a single one to make the code more concise and easier to maintain. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Yi Liu Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-8-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 70176c1c5573..2320548a90f8 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -40,7 +40,6 @@ struct iommu_domain_ops; struct iommu_dirty_ops; struct notifier_block; struct iommu_sva; -struct iommu_fault_event; struct iommu_dma_cookie; struct iopf_queue; @@ -121,6 +120,11 @@ struct iommu_page_response { u32 code; }; +struct iopf_fault { + struct iommu_fault fault; + /* node for pending lists */ + struct list_head list; +}; /* iommu fault flags */ #define IOMMU_FAULT_READ 0x0 @@ -553,7 +557,7 @@ struct iommu_ops { int (*dev_disable_feat)(struct device *dev, enum iommu_dev_features f); int (*page_response)(struct device *dev, - struct iommu_fault_event *evt, + struct iopf_fault *evt, struct iommu_page_response *msg); 
int (*def_domain_type)(struct device *dev); @@ -654,20 +658,6 @@ struct iommu_device { u32 max_pasids; }; -/** - * struct iommu_fault_event - Generic fault event - * - * Can represent recoverable faults such as a page requests or - * unrecoverable faults such as DMA or IRQ remapping faults. - * - * @fault: fault descriptor - * @list: pending fault event list, used for tracking responses - */ -struct iommu_fault_event { - struct iommu_fault fault; - struct list_head list; -}; - /** * struct iommu_fault_param - per-device IOMMU fault data * @lock: protect pending faults list @@ -802,8 +792,7 @@ extern struct iommu_group *iommu_group_get(struct device *dev); extern struct iommu_group *iommu_group_ref_get(struct iommu_group *group); extern void iommu_group_put(struct iommu_group *group); -extern int iommu_report_device_fault(struct device *dev, - struct iommu_fault_event *evt); +extern int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt); extern int iommu_page_response(struct device *dev, struct iommu_page_response *msg); @@ -1213,7 +1202,7 @@ static inline void iommu_group_put(struct iommu_group *group) } static inline -int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) +int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) { return -ENODEV; } -- cgit v1.2.3 From 24b5d268b5ab95c12b5ae58a054d04bfa442f58f Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:19 +0800 Subject: iommu: Prepare for separating SVA and IOPF Move iopf_group data structure to iommu.h to make it a minimal set of faults that a domain's page fault handler should handle. Add a new function, iopf_free_group(), to free a fault group after all faults in the group are handled. This function will be made global so that it can be called from other files, such as iommu-sva.c. Move iopf_queue data structure to iommu.h to allow the workqueue to be scheduled out of this file. This will simplify the sequential patches. 
Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-9-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 2320548a90f8..c9d4f175f121 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -41,7 +41,6 @@ struct iommu_dirty_ops; struct notifier_block; struct iommu_sva; struct iommu_dma_cookie; -struct iopf_queue; #define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ #define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */ @@ -126,6 +125,25 @@ struct iopf_fault { struct list_head list; }; +struct iopf_group { + struct iopf_fault last_fault; + struct list_head faults; + struct work_struct work; + struct device *dev; +}; + +/** + * struct iopf_queue - IO Page Fault queue + * @wq: the fault workqueue + * @devices: devices attached to this queue + * @lock: protects the device list + */ +struct iopf_queue { + struct workqueue_struct *wq; + struct list_head devices; + struct mutex lock; +}; + /* iommu fault flags */ #define IOMMU_FAULT_READ 0x0 #define IOMMU_FAULT_WRITE 0x1 -- cgit v1.2.3 From 351ffcb11ca0ff64e399982e279cfa131e7cb1aa Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:20 +0800 Subject: iommu: Make iommu_queue_iopf() more generic Make iommu_queue_iopf() more generic by making the iopf_group a minimal set of iopf's that an iopf handler of domain should handle and respond to. Add domain parameter to struct iopf_group so that the handler can retrieve and use it directly. Change iommu_queue_iopf() to forward groups of iopf's to the domain's iopf handler. This is also a necessary step to decouple the sva iopf handling code from this interface. 
Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Yi Liu Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-10-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c9d4f175f121..791f183a988e 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -130,6 +130,7 @@ struct iopf_group { struct list_head faults; struct work_struct work; struct device *dev; + struct iommu_domain *domain; }; /** @@ -209,8 +210,7 @@ struct iommu_domain { unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */ struct iommu_domain_geometry geometry; struct iommu_dma_cookie *iova_cookie; - enum iommu_page_response_code (*iopf_handler)(struct iommu_fault *fault, - void *data); + int (*iopf_handler)(struct iopf_group *group); void *fault_data; union { struct { -- cgit v1.2.3 From 17c51a0ea36b800e7a5998a92d83016c82935dff Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:21 +0800 Subject: iommu: Separate SVA and IOPF Add CONFIG_IOMMU_IOPF for page fault handling framework and select it from its real consumer. Move iopf function declaration from iommu-sva.h to iommu.h and remove iommu-sva.h as it's empty now. Consolidate all SVA related code into iommu-sva.c: - Move iommu_sva_domain_alloc() from iommu.c to iommu-sva.c. - Move sva iopf handling code from io-pgfault.c to iommu-sva.c. Consolidate iommu_report_device_fault() and iommu_page_response() into io-pgfault.c. Export iopf_free_group() and iopf_group_response() for iopf handlers implemented in modules. Some functions are renamed with more meaningful names. No other intentional functionality changes. 
Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-11-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 98 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 74 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 791f183a988e..fc912aed7886 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -810,10 +810,6 @@ extern struct iommu_group *iommu_group_get(struct device *dev); extern struct iommu_group *iommu_group_ref_get(struct iommu_group *group); extern void iommu_group_put(struct iommu_group *group); -extern int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt); -extern int iommu_page_response(struct device *dev, - struct iommu_page_response *msg); - extern int iommu_group_id(struct iommu_group *group); extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *); @@ -1029,8 +1025,6 @@ bool iommu_group_dma_owner_claimed(struct iommu_group *group); int iommu_device_claim_dma_owner(struct device *dev, void *owner); void iommu_device_release_dma_owner(struct device *dev); -struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, - struct mm_struct *mm); int iommu_attach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid); void iommu_detach_device_pasid(struct iommu_domain *domain, @@ -1219,18 +1213,6 @@ static inline void iommu_group_put(struct iommu_group *group) { } -static inline -int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) -{ - return -ENODEV; -} - -static inline int iommu_page_response(struct device *dev, - struct iommu_page_response *msg) -{ - return -ENODEV; -} - static inline int iommu_group_id(struct iommu_group *group) { return -ENODEV; @@ -1379,12 +1361,6 @@ static inline int 
iommu_device_claim_dma_owner(struct device *dev, void *owner) return -ENODEV; } -static inline struct iommu_domain * -iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm) -{ - return NULL; -} - static inline int iommu_attach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid) { @@ -1524,6 +1500,8 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm); void iommu_sva_unbind_device(struct iommu_sva *handle); u32 iommu_sva_get_pasid(struct iommu_sva *handle); +struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, + struct mm_struct *mm); #else static inline struct iommu_sva * iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) @@ -1548,6 +1526,78 @@ static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm) } static inline void mm_pasid_drop(struct mm_struct *mm) {} + +static inline struct iommu_domain * +iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm) +{ + return NULL; +} #endif /* CONFIG_IOMMU_SVA */ +#ifdef CONFIG_IOMMU_IOPF +int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev); +int iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev); +int iopf_queue_flush_dev(struct device *dev); +struct iopf_queue *iopf_queue_alloc(const char *name); +void iopf_queue_free(struct iopf_queue *queue); +int iopf_queue_discard_partial(struct iopf_queue *queue); +void iopf_free_group(struct iopf_group *group); +int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt); +int iommu_page_response(struct device *dev, struct iommu_page_response *msg); +int iopf_group_response(struct iopf_group *group, + enum iommu_page_response_code status); +#else +static inline int +iopf_queue_add_device(struct iopf_queue *queue, struct device *dev) +{ + return -ENODEV; +} + +static inline int +iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) +{ + return -ENODEV; +} + +static inline int iopf_queue_flush_dev(struct device 
*dev) +{ + return -ENODEV; +} + +static inline struct iopf_queue *iopf_queue_alloc(const char *name) +{ + return NULL; +} + +static inline void iopf_queue_free(struct iopf_queue *queue) +{ +} + +static inline int iopf_queue_discard_partial(struct iopf_queue *queue) +{ + return -ENODEV; +} + +static inline void iopf_free_group(struct iopf_group *group) +{ +} + +static inline int +iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) +{ + return -ENODEV; +} + +static inline int +iommu_page_response(struct device *dev, struct iommu_page_response *msg) +{ + return -ENODEV; +} + +static inline int iopf_group_response(struct iopf_group *group, + enum iommu_page_response_code status) +{ + return -ENODEV; +} +#endif /* CONFIG_IOMMU_IOPF */ #endif /* __LINUX_IOMMU_H */ -- cgit v1.2.3 From a74c077b9021b36c785095c571336e5b204d3c2d Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:23 +0800 Subject: iommu: Use refcount for fault data access The per-device fault data structure stores information about faults occurring on a device. Its lifetime spans from IOPF enablement to disablement. Multiple paths, including IOPF reporting, handling, and responding, may access it concurrently. Previously, a mutex protected the fault data from use after free. But this is not performance friendly due to the critical nature of IOPF handling paths. Refine this with a refcount-based approach. The fault data pointer is obtained within an RCU read region with a refcount. The fault data pointer is returned for usage only when the pointer is valid and a refcount is successfully obtained. The fault data is freed with kfree_rcu(), ensuring data is only freed after all RCU critical regions complete. An iopf handling work starts once an iopf group is created. The handling work continues until iommu_page_response() is called to respond to the iopf and the iopf group is freed. During this time, the device fault parameter should always be available. 
Add a pointer to the device fault parameter in the iopf_group structure and hold the reference until the iopf_group is freed. Make iommu_page_response() static as it is only used in io-pgfault.c. Co-developed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Tested-by: Yan Zhao Link: https://lore.kernel.org/r/20240212012227.119381-13-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index fc912aed7886..1e9161ae95da 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -41,6 +41,7 @@ struct iommu_dirty_ops; struct notifier_block; struct iommu_sva; struct iommu_dma_cookie; +struct iommu_fault_param; #define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ #define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */ @@ -129,8 +130,9 @@ struct iopf_group { struct iopf_fault last_fault; struct list_head faults; struct work_struct work; - struct device *dev; struct iommu_domain *domain; + /* The device's fault data parameter. 
*/ + struct iommu_fault_param *fault_param; }; /** @@ -679,6 +681,8 @@ struct iommu_device { /** * struct iommu_fault_param - per-device IOMMU fault data * @lock: protect pending faults list + * @users: user counter to manage the lifetime of the data + * @rcu: rcu head for kfree_rcu() * @dev: the device that owns this param * @queue: IOPF queue * @queue_list: index into queue->devices @@ -688,6 +692,8 @@ struct iommu_device { */ struct iommu_fault_param { struct mutex lock; + refcount_t users; + struct rcu_head rcu; struct device *dev; struct iopf_queue *queue; @@ -715,7 +721,7 @@ struct iommu_fault_param { */ struct dev_iommu { struct mutex lock; - struct iommu_fault_param *fault_param; + struct iommu_fault_param __rcu *fault_param; struct iommu_fwspec *fwspec; struct iommu_device *iommu_dev; void *priv; @@ -1543,7 +1549,6 @@ void iopf_queue_free(struct iopf_queue *queue); int iopf_queue_discard_partial(struct iopf_queue *queue); void iopf_free_group(struct iopf_group *group); int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt); -int iommu_page_response(struct device *dev, struct iommu_page_response *msg); int iopf_group_response(struct iopf_group *group, enum iommu_page_response_code status); #else @@ -1588,12 +1593,6 @@ iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) return -ENODEV; } -static inline int -iommu_page_response(struct device *dev, struct iommu_page_response *msg) -{ - return -ENODEV; -} - static inline int iopf_group_response(struct iopf_group *group, enum iommu_page_response_code status) { -- cgit v1.2.3 From 0095bf83554f8e7a681961656608101bdf40e9ef Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:24 +0800 Subject: iommu: Improve iopf_queue_remove_device() Convert iopf_queue_remove_device() to return void instead of an error code, as the return value is never used. This removal helper is designed to be never-failed, so there's no need for error handling. 
Ack all outstanding page requests from the device with the response code of IOMMU_PAGE_RESP_INVALID, indicating device should not attempt any retry. Add comments to this helper explaining the steps involved in removing a device from the iopf queue and disabling its PRI. The individual drivers are expected to be adjusted accordingly. Here we just define the expected behaviors of the individual iommu driver from the core's perspective. Suggested-by: Jason Gunthorpe Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Tested-by: Yan Zhao Link: https://lore.kernel.org/r/20240212012227.119381-14-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 1e9161ae95da..92dfd9b94577 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1542,7 +1542,7 @@ iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm) #ifdef CONFIG_IOMMU_IOPF int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev); -int iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev); +void iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev); int iopf_queue_flush_dev(struct device *dev); struct iopf_queue *iopf_queue_alloc(const char *name); void iopf_queue_free(struct iopf_queue *queue); @@ -1558,10 +1558,9 @@ iopf_queue_add_device(struct iopf_queue *queue, struct device *dev) return -ENODEV; } -static inline int +static inline void iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) { - return -ENODEV; } static inline int iopf_queue_flush_dev(struct device *dev) -- cgit v1.2.3 From 19911232713573a2ebea84a25bd4d71d024ed86b Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:25 +0800 Subject: iommu: Track iopf group instead of last fault Previously, before a group of page faults was passed to the domain's iopf handler, the last page 
fault of the group was kept in the list of iommu_fault_param::faults. In the page fault response path, the group's last page fault was used to look up the list, and the page faults were responded to device only if there was a matched fault. The previous approach seems unnecessarily complex and not performance friendly. Put the page fault group itself to the outstanding fault list. It can be removed in the page fault response path or in the iopf_queue_remove_device() path. The pending list is protected by iommu_fault_param::lock. To allow checking for the group's presence in the list using list_empty(), the iopf group should be removed from the list with list_del_init(). IOMMU_PAGE_RESP_PASID_VALID is set in the code but not used anywhere. Remove it to make the code clean. IOMMU_PAGE_RESP_PASID_VALID is set in the response message indicating that the response message includes a valid PASID value. Actually, we should keep this hardware detail in the individual driver. When the page fault handling framework in IOMMU and IOMMUFD subsystems includes a valid PASID in the fault message, the response message should always contain the same PASID value. Individual drivers should be responsible for deciding whether to include the PASID in the messages they provide for the hardware. 
Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Tested-by: Yan Zhao Link: https://lore.kernel.org/r/20240212012227.119381-15-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 92dfd9b94577..f8ed1cc7212e 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -106,15 +106,11 @@ enum iommu_page_response_code { /** * struct iommu_page_response - Generic page response information - * @flags: encodes whether the corresponding fields are valid - * (IOMMU_FAULT_PAGE_RESPONSE_* values) * @pasid: Process Address Space ID * @grpid: Page Request Group Index * @code: response code from &enum iommu_page_response_code */ struct iommu_page_response { -#define IOMMU_PAGE_RESP_PASID_VALID (1 << 0) - u32 flags; u32 pasid; u32 grpid; u32 code; @@ -129,6 +125,8 @@ struct iopf_fault { struct iopf_group { struct iopf_fault last_fault; struct list_head faults; + /* list node for iommu_fault_param::faults */ + struct list_head pending_node; struct work_struct work; struct iommu_domain *domain; /* The device's fault data parameter. */ -- cgit v1.2.3 From b554e396e51ce3d378a560666f85c6836a8323fd Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:26 +0800 Subject: iommu: Make iopf_group_response() return void The iopf_group_response() should return void, as nothing can do anything with the failure. This implies that ops->page_response() must also return void; this is consistent with what the drivers do. The failure paths, which are all integrity validations of the fault, should be WARN_ON'd, not return codes. If the iommu core fails to enqueue the fault, it should respond to the fault directly by calling ops->page_response() instead of returning an error number and relying on the iommu drivers to do so. Consolidate the error fault handling code in the core. 
Co-developed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240212012227.119381-16-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index f8ed1cc7212e..f632775414a5 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -574,9 +574,8 @@ struct iommu_ops { int (*dev_enable_feat)(struct device *dev, enum iommu_dev_features f); int (*dev_disable_feat)(struct device *dev, enum iommu_dev_features f); - int (*page_response)(struct device *dev, - struct iopf_fault *evt, - struct iommu_page_response *msg); + void (*page_response)(struct device *dev, struct iopf_fault *evt, + struct iommu_page_response *msg); int (*def_domain_type)(struct device *dev); void (*remove_dev_pasid)(struct device *dev, ioasid_t pasid); @@ -1547,8 +1546,8 @@ void iopf_queue_free(struct iopf_queue *queue); int iopf_queue_discard_partial(struct iopf_queue *queue); void iopf_free_group(struct iopf_group *group); int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt); -int iopf_group_response(struct iopf_group *group, - enum iommu_page_response_code status); +void iopf_group_response(struct iopf_group *group, + enum iommu_page_response_code status); #else static inline int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev) @@ -1590,10 +1589,9 @@ iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) return -ENODEV; } -static inline int iopf_group_response(struct iopf_group *group, - enum iommu_page_response_code status) +static inline void iopf_group_response(struct iopf_group *group, + enum iommu_page_response_code status) { - return -ENODEV; } #endif /* CONFIG_IOMMU_IOPF */ #endif /* __LINUX_IOMMU_H */ -- cgit v1.2.3 From 
3dfa64aecbafc288216b2790438d395add192c30 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:27 +0800 Subject: iommu: Make iommu_report_device_fault() return void As the iommu_report_device_fault() has been converted to auto-respond a page fault if it fails to enqueue it, there's no need to return a code in any case. Make it return void. Suggested-by: Jason Gunthorpe Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240212012227.119381-17-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index f632775414a5..7cc56cfe98dd 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1545,7 +1545,7 @@ struct iopf_queue *iopf_queue_alloc(const char *name); void iopf_queue_free(struct iopf_queue *queue); int iopf_queue_discard_partial(struct iopf_queue *queue); void iopf_free_group(struct iopf_group *group); -int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt); +void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt); void iopf_group_response(struct iopf_group *group, enum iommu_page_response_code status); #else @@ -1583,10 +1583,9 @@ static inline void iopf_free_group(struct iopf_group *group) { } -static inline int +static inline void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) { - return -ENODEV; } static inline void iopf_group_response(struct iopf_group *group, -- cgit v1.2.3 From e01c9797c0ebb307c9bb196c677f6e571335773e Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 16 Feb 2024 14:45:14 +0100 Subject: PCI: endpoint: Clean up hardware description for BARs The hardware description for BARs is scattered in many different variables in pci_epc_features. 
Some of these things are mutually exclusive, so it can create confusion over which variable has precedence over another. Improve the situation by creating a struct pci_epc_bar_desc, and a new enum pci_epc_bar_type, and convert the endpoint controller drivers to use this more well defined format. Additionally, some endpoint controller drivers mark the BAR succeeding a "64-bit only BAR" as reserved, while some do not. By definition, a 64-bit BAR uses the succeeding BAR for the upper 32-bits, so an EPF driver cannot use a BAR succeeding a 64-bit BAR. Ensure that all endpoint controller drivers are uniform, and actually describe a reserved BAR as reserved. Signed-off-by: Niklas Cassel Reviewed-by: Kishon Vijay Abraham I Reviewed-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20240216134524.1142149-2-cassel@kernel.org Signed-off-by: Manivannan Sadhasivam --- include/linux/pci-epc.h | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index 40ea18f5aa02..4ccb4f4f3883 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -145,6 +145,32 @@ struct pci_epc { unsigned long function_num_map; }; +/** + * @BAR_PROGRAMMABLE: The BAR mask can be configured by the EPC. + * @BAR_FIXED: The BAR mask is fixed by the hardware. + * @BAR_RESERVED: The BAR should not be touched by an EPF driver. + */ +enum pci_epc_bar_type { + BAR_PROGRAMMABLE = 0, + BAR_FIXED, + BAR_RESERVED, +}; + +/** + * struct pci_epc_bar_desc - hardware description for a BAR + * @type: the type of the BAR + * @fixed_size: the fixed size, only applicable if type is BAR_FIXED. + * @only_64bit: if true, an EPF driver is not allowed to choose if this BAR + * should be configured as 32-bit or 64-bit, the EPF driver must + * configure this BAR as 64-bit. Additionally, the BAR succeeding + * this BAR must be set to type BAR_RESERVED. 
+ */ +struct pci_epc_bar_desc { + enum pci_epc_bar_type type; + u64 fixed_size; + bool only_64bit; +}; + /** * struct pci_epc_features - features supported by a EPC device per function * @linkup_notifier: indicate if the EPC device can notify EPF driver on link up @@ -152,9 +178,7 @@ struct pci_epc { * for initialization * @msi_capable: indicate if the endpoint function has MSI capability * @msix_capable: indicate if the endpoint function has MSI-X capability - * @reserved_bar: bitmap to indicate reserved BAR unavailable to function driver - * @bar_fixed_64bit: bitmap to indicate fixed 64bit BARs - * @bar_fixed_size: Array specifying the size supported by each BAR + * @bar: array specifying the hardware description for each BAR * @align: alignment size required for BAR buffer allocation */ struct pci_epc_features { @@ -162,9 +186,7 @@ struct pci_epc_features { unsigned int core_init_notifier : 1; unsigned int msi_capable : 1; unsigned int msix_capable : 1; - u8 reserved_bar; - u8 bar_fixed_64bit; - u64 bar_fixed_size[PCI_STD_NUM_BARS]; + struct pci_epc_bar_desc bar[PCI_STD_NUM_BARS]; size_t align; }; -- cgit v1.2.3 From 9266514689fe6476423209ee40168db53134101d Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 16 Feb 2024 14:45:15 +0100 Subject: PCI: endpoint: Drop only_64bit on reserved BARs The definition of a reserved BAR is that EPF drivers should not touch them. The definition of only_64bit is that the EPF driver must configure this BAR as 64-bit. (An EPF driver is not allowed to choose if this BAR should be configured as 32-bit or 64-bit.) Thus, it does not make sense to put only_64bit on a BAR that EPF drivers are not allowed to touch. Drop the only_64bit property from hardware descriptions that are of type reserved BAR. 
Signed-off-by: Niklas Cassel Reviewed-by: Kishon Vijay Abraham I Reviewed-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20240216134524.1142149-3-cassel@kernel.org Signed-off-by: Manivannan Sadhasivam --- include/linux/pci-epc.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index 4ccb4f4f3883..cc2f70d061c8 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -164,6 +164,11 @@ enum pci_epc_bar_type { * should be configured as 32-bit or 64-bit, the EPF driver must * configure this BAR as 64-bit. Additionally, the BAR succeeding * this BAR must be set to type BAR_RESERVED. + * + * only_64bit should not be set on a BAR of type BAR_RESERVED. + * (If BARx is a 64-bit BAR that an EPF driver is not allowed to + * touch, then both BARx and BARx+1 must be set to type + * BAR_RESERVED.) */ struct pci_epc_bar_desc { enum pci_epc_bar_type type; -- cgit v1.2.3 From 31a5c0b7c674977889ce721d69101bc35f25e041 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:15 +0000 Subject: tick/nohz: Move tick_nohz_full_mask declaration outside the #ifdef tick_nohz_full_mask lists the CPUs that are nohz_full. This is only needed when CONFIG_NO_HZ_FULL is defined. tick_nohz_full_cpu() allows a specific CPU to be tested against the mask, and evaluates to false when CONFIG_NO_HZ_FULL is not defined. The resctrl code needs to pick a CPU to run some work on, a new helper prefers housekeeping CPUs by examining the tick_nohz_full_mask. Hiding the declaration behind #ifdef CONFIG_NO_HZ_FULL forces all the users to be behind an #ifdef too. Move the tick_nohz_full_mask declaration, this lets callers drop the #ifdef, and guard access to tick_nohz_full_mask with IS_ENABLED() or something like tick_nohz_full_cpu(). The definition does not need to be moved as any callers should be removed at compile time unless CONFIG_NO_HZ_FULL is defined. 
Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Thomas Gleixner Acked-by: Reinette Chatre # for resctrl dependency Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-2-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) --- include/linux/tick.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tick.h b/include/linux/tick.h index 716d17f31c45..0fb903838dfb 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -164,9 +164,16 @@ static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } #endif /* !CONFIG_NO_HZ_COMMON */ +/* + * Mask of CPUs that are nohz_full. + * + * Users should be guarded by CONFIG_NO_HZ_FULL or a tick_nohz_full_cpu() + * check. + */ +extern cpumask_var_t tick_nohz_full_mask; + #ifdef CONFIG_NO_HZ_FULL extern bool tick_nohz_full_running; -extern cpumask_var_t tick_nohz_full_mask; static inline bool tick_nohz_full_enabled(void) { -- cgit v1.2.3 From 40fc735b78f0c81cea7d1c511cfd83892cb4d679 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:19 +0000 Subject: x86/resctrl: Track the closid with the rmid x86's RMID are independent of the CLOSID. An RMID can be allocated, used and freed without considering the CLOSID. MPAM's equivalent feature is PMG, which is not an independent number, it extends the CLOSID/PARTID space. For MPAM, only PMG-bits worth of 'RMID' can be allocated for a single CLOSID. i.e. if there is 1 bit of PMG space, then each CLOSID can have two monitor groups. To allow resctrl to disambiguate RMID values for different CLOSID, everything in resctrl that keeps an RMID value needs to know the CLOSID too. This will always be ignored on x86. 
Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Xin Hao Reviewed-by: Reinette Chatre Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-6-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) --- include/linux/resctrl.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 66942d7fba7f..bd4ec22b5a96 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -6,6 +6,10 @@ #include #include +/* CLOSID, RMID value used by the default control group */ +#define RESCTRL_RESERVED_CLOSID 0 +#define RESCTRL_RESERVED_RMID 0 + #ifdef CONFIG_PROC_CPU_RESCTRL int proc_resctrl_show(struct seq_file *m, @@ -225,6 +229,9 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); * for this resource and domain. * @r: resource that the counter should be read from. * @d: domain that the counter should be read from. + * @closid: closid that matches the rmid. Depending on the architecture, the + * counter may match traffic of both @closid and @rmid, or @rmid + * only. * @rmid: rmid of the counter to read. * @eventid: eventid to read, e.g. L3 occupancy. * @val: result of the counter read in bytes. @@ -235,20 +242,25 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); * 0 on success, or -EIO, -EINVAL etc on error. */ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, - u32 rmid, enum resctrl_event_id eventid, u64 *val); + u32 closid, u32 rmid, enum resctrl_event_id eventid, + u64 *val); + /** * resctrl_arch_reset_rmid() - Reset any private state associated with rmid * and eventid. * @r: The domain's resource. * @d: The rmid's domain. + * @closid: closid that matches the rmid. 
Depending on the architecture, the + * counter may match traffic of both @closid and @rmid, or @rmid only. * @rmid: The rmid whose counter values should be reset. * @eventid: The eventid whose counter values should be reset. * * This can be called from any CPU. */ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, - u32 rmid, enum resctrl_event_id eventid); + u32 closid, u32 rmid, + enum resctrl_event_id eventid); /** * resctrl_arch_reset_rmid_all() - Reset all private state associated with -- cgit v1.2.3 From 6fde1424f29b151b9dc8c660eecf4d1645facea5 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:28 +0000 Subject: x86/resctrl: Allow resctrl_arch_rmid_read() to sleep MPAM's cache occupancy counters can take a little while to settle once the monitor has been configured. The maximum settling time is described to the driver via a firmware table. The value could be large enough that it makes sense to sleep. To avoid exposing this to resctrl, it should be hidden behind MPAM's resctrl_arch_rmid_read(). resctrl_arch_rmid_read() may be called via IPI meaning it is unable to sleep. In this case, it should return an error if it needs to sleep. This will only affect MPAM platforms where the cache occupancy counter isn't available immediately, nohz_full is in use, and there are no housekeeping CPUs in the necessary domain. There are three callers of resctrl_arch_rmid_read(): __mon_event_count() and __check_limbo() are both called from a non-migrateable context. mon_event_read() invokes __mon_event_count() using smp_call_on_cpu(), which adds work to the target CPUs workqueue. rdtgroup_mutex() is held, meaning this cannot race with the resctrl cpuhp callback. __check_limbo() is invoked via schedule_delayed_work_on() also adds work to a per-cpu workqueue. The remaining call is add_rmid_to_limbo() which is called in response to a user-space syscall that frees an RMID. 
This opportunistically reads the LLC occupancy counter on the current domain to see if the RMID is over the dirty threshold. This has to disable preemption to avoid reading the wrong domain's value. Disabling preemption here prevents resctrl_arch_rmid_read() from sleeping. add_rmid_to_limbo() walks each domain, but only reads the counter on one domain. If the system has more than one domain, the RMID will always be added to the limbo list. If the RMIDs usage was not over the threshold, it will be removed from the list when __check_limbo() runs. Make this the default behaviour. Free RMIDs are always added to the limbo list for each domain. The user visible effect of this is that a clean RMID is not available for re-allocation immediately after 'rmdir()' completes. This behaviour was never portable as it never happened on a machine with multiple domains. Removing this path allows resctrl_arch_rmid_read() to sleep if its called with interrupts unmasked. Document this is the expected behaviour, and add a might_sleep() annotation to catch changes that won't work on arm64. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-15-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) --- include/linux/resctrl.h | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index bd4ec22b5a96..8649fc84aac2 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -236,7 +236,12 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); * @eventid: eventid to read, e.g. L3 occupancy. * @val: result of the counter read in bytes. * - * Call from process context on a CPU that belongs to domain @d. 
+ * Some architectures need to sleep when first programming some of the counters. + * (specifically: arm64's MPAM cache occupancy counters can return 'not ready' + * for a short period of time). Call from a non-migrateable process context on + * a CPU that belongs to domain @d. e.g. use smp_call_on_cpu() or + * schedule_work_on(). This function can be called with interrupts masked, + * e.g. using smp_call_function_any(), but may consistently return an error. * * Return: * 0 on success, or -EIO, -EINVAL etc on error. @@ -245,6 +250,22 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, u32 closid, u32 rmid, enum resctrl_event_id eventid, u64 *val); +/** + * resctrl_arch_rmid_read_context_check() - warn about invalid contexts + * + * When built with CONFIG_DEBUG_ATOMIC_SLEEP generate a warning when + * resctrl_arch_rmid_read() is called with preemption disabled. + * + * The contract with resctrl_arch_rmid_read() is that if interrupts + * are unmasked, it can sleep. This allows NOHZ_FULL systems to use an + * IPI, (and fail if the call needed to sleep), while most of the time + * the work is scheduled, allowing the call to sleep. + */ +static inline void resctrl_arch_rmid_read_context_check(void) +{ + if (!irqs_disabled()) + might_sleep(); +} /** * resctrl_arch_reset_rmid() - Reset any private state associated with rmid -- cgit v1.2.3 From e557999f80a5ee4ec812f594ab42bb76c3ec4eb2 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:29 +0000 Subject: x86/resctrl: Allow arch to allocate memory needed in resctrl_arch_rmid_read() Depending on the number of monitors available, Arm's MPAM may need to allocate a monitor prior to reading the counter value. Allocating a contended resource may involve sleeping. __check_limbo() and mon_event_count() each make multiple calls to resctrl_arch_rmid_read(), to avoid extra work on contended systems, the allocation should be valid for multiple invocations of resctrl_arch_rmid_read(). 
The memory or hardware allocated is not specific to a domain. Add arch hooks for this allocation, which need calling before resctrl_arch_rmid_read(). The allocated monitor is passed to resctrl_arch_rmid_read(), then freed again afterwards. The helper can be called on any CPU, and can sleep. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-16-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) --- include/linux/resctrl.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 8649fc84aac2..bf460c912bf5 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -235,6 +235,9 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); * @rmid: rmid of the counter to read. * @eventid: eventid to read, e.g. L3 occupancy. * @val: result of the counter read in bytes. + * @arch_mon_ctx: An architecture specific value from + * resctrl_arch_mon_ctx_alloc(), for MPAM this identifies + * the hardware monitor allocated for this read request. * * Some architectures need to sleep when first programming some of the counters. 
* (specifically: arm64's MPAM cache occupancy counters can return 'not ready' @@ -248,7 +251,7 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); */ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, u32 closid, u32 rmid, enum resctrl_event_id eventid, - u64 *val); + u64 *val, void *arch_mon_ctx); /** * resctrl_arch_rmid_read_context_check() - warn about invalid contexts -- cgit v1.2.3 From 1b3e50ce7f5001f1e0edaf7d6abea43b264db7ee Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:34 +0000 Subject: x86/resctrl: Add CPU online callback for resctrl work The resctrl architecture specific code may need to create a domain when a CPU comes online, it also needs to reset the CPUs PQR_ASSOC register. The resctrl filesystem code needs to update the rdtgroup_default CPU mask when CPUs are brought online. Currently, this is all done in one function, resctrl_online_cpu(). It will need to be split into architecture and filesystem parts before resctrl can be moved to /fs/. Pull the rdtgroup_default update work out as a filesystem specific cpu_online helper. resctrl_online_cpu() is the obvious name for this, which means the version in core.c needs renaming. resctrl_online_cpu() is called by the arch code once it has done the work to add the new CPU to any domains. In future patches, resctrl_online_cpu() will take the rdtgroup_mutex itself. 
Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-21-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) --- include/linux/resctrl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index bf460c912bf5..4c4bad3c34e4 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -223,6 +223,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, u32 closid, enum resctrl_conf_type type); int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d); void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); +void resctrl_online_cpu(unsigned int cpu); /** * resctrl_arch_rmid_read() - Read the eventid counter corresponding to rmid -- cgit v1.2.3 From 978fcca954cb52249babbc14e53de53c88dd6433 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:35 +0000 Subject: x86/resctrl: Allow overflow/limbo handlers to be scheduled on any-but CPU When a CPU is taken offline resctrl may need to move the overflow or limbo handlers to run on a different CPU. Once the offline callbacks have been split, cqm_setup_limbo_handler() will be called while the CPU that is going offline is still present in the CPU mask. Pass the CPU to exclude to cqm_setup_limbo_handler() and mbm_setup_overflow_handler(). These functions can use a variant of cpumask_any_but() when selecting the CPU. -1 is used to indicate no CPUs need excluding. 
Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Babu Moger Reviewed-by: Reinette Chatre Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-22-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) --- include/linux/resctrl.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 4c4bad3c34e4..ccbbbe5d18d3 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -10,6 +10,8 @@ #define RESCTRL_RESERVED_CLOSID 0 #define RESCTRL_RESERVED_RMID 0 +#define RESCTRL_PICK_ANY_CPU -1 + #ifdef CONFIG_PROC_CPU_RESCTRL int proc_resctrl_show(struct seq_file *m, -- cgit v1.2.3 From 258c91e84fedc789353a35ad91d827a9111d3cbd Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:36 +0000 Subject: x86/resctrl: Add CPU offline callback for resctrl work The resctrl architecture specific code may need to free a domain when a CPU goes offline, it also needs to reset the CPUs PQR_ASSOC register. Amongst other things, the resctrl filesystem code needs to clear this CPU from the cpu_mask of any control and monitor groups. Currently, this is all done in core.c and called from resctrl_offline_cpu(), making the split between architecture and filesystem code unclear. Move the filesystem work to remove the CPU from the control and monitor groups into a filesystem helper called resctrl_offline_cpu(), and rename the one in core.c resctrl_arch_offline_cpu(). 
Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-23-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) --- include/linux/resctrl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index ccbbbe5d18d3..270ff1d5c051 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -226,6 +226,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d); void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d); void resctrl_online_cpu(unsigned int cpu); +void resctrl_offline_cpu(unsigned int cpu); /** * resctrl_arch_rmid_read() - Read the eventid counter corresponding to rmid -- cgit v1.2.3 From f1cebae1dbf85f9de65c13a2d9f5cc3be7e51dc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=ADcolas=20F=2E=20R=2E=20A=2E=20Prado?= Date: Mon, 12 Feb 2024 09:50:06 -0500 Subject: firmware: coreboot: Generate aliases for coreboot modules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Generate aliases for coreboot modules to allow automatic module probing. Reviewed-by: AngeloGioacchino Del Regno Reviewed-by: Brian Norris Signed-off-by: Nícolas F. R. A. 
Prado Acked-by: Masahiro Yamada Link: https://lore.kernel.org/r/20240212-coreboot-mod-defconfig-v4-2-d14172676f6d@collabora.com Signed-off-by: Tzung-Bi Shih --- include/linux/mod_devicetable.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index f458469c5ce5..7a9a07ea451b 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -960,4 +960,14 @@ struct vchiq_device_id { char name[32]; }; +/** + * struct coreboot_device_id - Identifies a coreboot table entry + * @tag: tag ID + * @driver_data: driver specific data + */ +struct coreboot_device_id { + __u32 tag; + kernel_ulong_t driver_data; +}; + #endif /* LINUX_MOD_DEVICETABLE_H */ -- cgit v1.2.3 From 12b8ae68f50de200c038246c2496822f38b18fe2 Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Wed, 7 Feb 2024 17:50:59 +0800 Subject: crypto: hisilicon/qm - add stop function by hardware Hardware V3 could be able to drain function by sending mailbox to hardware which will trigger tasks in device to be flushed out. When the function is reset, the function can be stopped by this way. 
Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 5f4c74facf6a..720f10874a66 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -43,6 +43,7 @@ #define QM_MB_CMD_CQC_BT 0x5 #define QM_MB_CMD_SQC_VFT_V2 0x6 #define QM_MB_CMD_STOP_QP 0x8 +#define QM_MB_CMD_FLUSH_QM 0x9 #define QM_MB_CMD_SRC 0xc #define QM_MB_CMD_DST 0xd @@ -151,6 +152,7 @@ enum qm_cap_bits { QM_SUPPORT_DB_ISOLATION = 0x0, QM_SUPPORT_FUNC_QOS, QM_SUPPORT_STOP_QP, + QM_SUPPORT_STOP_FUNC, QM_SUPPORT_MB_COMMAND, QM_SUPPORT_SVA_PREFETCH, QM_SUPPORT_RPM, -- cgit v1.2.3 From ce133a22123055f5f988499cd9ac7953d2bf0677 Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Wed, 7 Feb 2024 17:51:00 +0800 Subject: crypto: hisilicon/qm - obtain stop queue status The debugfs files 'dev_state' and 'dev_timeout' are added. Users can query the current queue stop status through these two files, and set the waiting timeout used when the queue is released. dev_state: if dev_timeout is set, dev_state indicates the status of stopping the queue. 0 indicates that the queue is stopped successfully. Other values indicate that the queue failed to stop. If dev_timeout is not set, the value of dev_state is 0; dev_timeout: if the queue fails to stop, the queue is released after waiting dev_timeout * 20ms.
Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 720f10874a66..2d14742ad729 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -163,6 +163,11 @@ struct qm_dev_alg { const char *alg; }; +struct qm_dev_dfx { + u32 dev_state; + u32 dev_timeout; +}; + struct dfx_diff_registers { u32 *regs; u32 reg_offset; @@ -191,6 +196,7 @@ struct qm_debug { struct dentry *debug_root; struct dentry *qm_d; struct debugfs_file files[DEBUG_FILE_NUM]; + struct qm_dev_dfx dev_dfx; unsigned int *qm_last_words; /* ACC engines recoreding last regs */ unsigned int *last_words; -- cgit v1.2.3 From 9066ac364d8659ab7c993b83c60a6182c3ec1ef9 Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Wed, 7 Feb 2024 17:51:01 +0800 Subject: crypto: hisilicon/qm - change function type to void The function qm_stop_qp_nolock() always return zero, so function type is changed to void. 
Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 2d14742ad729..9d7754ad5e9b 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -531,7 +531,7 @@ void hisi_qm_uninit(struct hisi_qm *qm); int hisi_qm_start(struct hisi_qm *qm); int hisi_qm_stop(struct hisi_qm *qm, enum qm_stop_reason r); int hisi_qm_start_qp(struct hisi_qp *qp, unsigned long arg); -int hisi_qm_stop_qp(struct hisi_qp *qp); +void hisi_qm_stop_qp(struct hisi_qp *qp); int hisi_qp_send(struct hisi_qp *qp, const void *msg); void hisi_qm_debug_init(struct hisi_qm *qm); void hisi_qm_debug_regs_clear(struct hisi_qm *qm); -- cgit v1.2.3 From 9b99c17f7510bed2adbe17751fb8abddba5620bc Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Fri, 12 Jan 2024 12:09:50 -0800 Subject: x86/numa: Fix the address overlap check in numa_fill_memblks() numa_fill_memblks() fills in the gaps in numa_meminfo memblks over a physical address range. To do so, it first creates a list of existing memblks that overlap that address range. The issue is that it is off by one when comparing to the end of the address range, so memblks that do not overlap are selected. The impact of selecting a memblk that does not actually overlap is that an existing memblk may be filled when the expected action is to do nothing and return NUMA_NO_MEMBLK to the caller. The caller can then add a new NUMA node and memblk. Replace the broken open-coded search for address overlap with the memblock helper memblock_addrs_overlap(). Update the kernel doc and in code comments. 
Suggested-by: "Huang, Ying" Fixes: 8f012db27c95 ("x86/numa: Introduce numa_fill_memblks()") Signed-off-by: Alison Schofield Acked-by: Mike Rapoport (IBM) Acked-by: Dave Hansen Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/10a3e6109c34c21a8dd4c513cf63df63481a2b07.1705085543.git.alison.schofield@intel.com Signed-off-by: Dan Williams --- include/linux/memblock.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index b695f9e946da..e2082240586d 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -121,6 +121,8 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size); int memblock_physmem_add(phys_addr_t base, phys_addr_t size); #endif void memblock_trim_memory(phys_addr_t align); +unsigned long memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, + phys_addr_t base2, phys_addr_t size2); bool memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size); bool memblock_validate_numa_coverage(unsigned long threshold_bytes); -- cgit v1.2.3 From 2444a80c1cc2c4240f60f2162abef3797c1803de Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 14 Feb 2024 08:48:28 +0000 Subject: kobject: make uevent_seqnum atomic We will soon no longer acquire uevent_sock_mutex for most kobject_uevent_net_broadcast() calls, and also while calling uevent_net_broadcast(). Make uevent_seqnum an atomic64_t to get its own protection. This fixes a race while reading /sys/kernel/uevent_seqnum.
Signed-off-by: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Christian Brauner Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20240214084829.684541-2-edumazet@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/kobject.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kobject.h b/include/linux/kobject.h index c30affcc43b4..c8219505a79f 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -38,7 +38,7 @@ extern char uevent_helper[]; #endif /* counter to tag the uevent, read only except for the kobject core */ -extern u64 uevent_seqnum; +extern atomic64_t uevent_seqnum; /* * The actions here must match the index to the string array -- cgit v1.2.3 From 99f638dd49ca80538addec6b3733ddb5784c9373 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Tue, 30 Jan 2024 13:23:37 +0100 Subject: usb: gadget: Support already-mapped DMA SGs Add a new 'sg_was_mapped' field to the struct usb_request. This field can be used to indicate that the scatterlist associated to the USB transfer has already been mapped into the DMA space, and it does not have to be done internally. Signed-off-by: Paul Cercueil Link: https://lore.kernel.org/r/20240130122340.54813-2-paul@crapouillou.net Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/gadget.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index a771ccc038ac..c529e4e06997 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -52,6 +52,7 @@ struct usb_ep; * @short_not_ok: When reading data, makes short packets be * treated as errors (queue stops advancing till cleanup). * @dma_mapped: Indicates if request has been mapped to DMA (internal) + * @sg_was_mapped: Set if the scatterlist has been mapped before the request * @complete: Function called when request completes, so this request and * its buffer may be re-used. 
The function will always be called with * interrupts disabled, and it must not sleep. @@ -111,6 +112,7 @@ struct usb_request { unsigned zero:1; unsigned short_not_ok:1; unsigned dma_mapped:1; + unsigned sg_was_mapped:1; void (*complete)(struct usb_ep *ep, struct usb_request *req); -- cgit v1.2.3 From 1dae0cb79ceacbdc7495c5f83ca71e1c12a24d7c Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Sun, 28 Jan 2024 15:05:28 +0000 Subject: iio: locking: introduce __cleanup() based direct mode claiming infrastructure Allows use of: iio_device_claim_direct_scoped(return -EBUSY, indio_dev) { } to automatically call iio_device_release_direct_mode() based on scope. Typically seen in combination with local device specific locks which already have automated cleanup options via guard(mutex)(&st->lock) and scoped_guard(). Using both together allows most error handling to be automated. Reviewed-by: Nuno Sa Link: https://lore.kernel.org/r/20240128150537.44592-2-jic23@kernel.org Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index c5b36d2c1e73..4f89279e531c 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -9,6 +9,7 @@ #include #include +#include #include #include /* IIO TODO LIST */ @@ -638,6 +639,33 @@ int __devm_iio_device_register(struct device *dev, struct iio_dev *indio_dev, int iio_push_event(struct iio_dev *indio_dev, u64 ev_code, s64 timestamp); int iio_device_claim_direct_mode(struct iio_dev *indio_dev); void iio_device_release_direct_mode(struct iio_dev *indio_dev); + +/* + * This autocleanup logic is normally used via + * iio_device_claim_direct_scoped().
+ */ +DEFINE_GUARD(iio_claim_direct, struct iio_dev *, iio_device_claim_direct_mode(_T), + iio_device_release_direct_mode(_T)) + +DEFINE_GUARD_COND(iio_claim_direct, _try, ({ + struct iio_dev *dev; + int d = iio_device_claim_direct_mode(_T); + + if (d < 0) + dev = NULL; + else + dev = _T; + dev; + })) + +/** + * iio_device_claim_direct_scoped() - Scoped call to iio_device_claim_direct. + * @fail: What to do on failure to claim device. + * @iio_dev: Pointer to the IIO devices structure + */ +#define iio_device_claim_direct_scoped(fail, iio_dev) \ + scoped_cond_guard(iio_claim_direct_try, fail, iio_dev) + int iio_device_claim_buffer_mode(struct iio_dev *indio_dev); void iio_device_release_buffer_mode(struct iio_dev *indio_dev); -- cgit v1.2.3 From 89b1b86fc77367fac470258acdf470ffe2edc8d4 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Thu, 8 Feb 2024 16:37:19 -0300 Subject: iio: core: make iio_bus_type const Now that the driver core can properly handle constant struct bus_type, move the iio_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B. 
Marliere Acked-by: Nuno Sa Link: https://lore.kernel.org/r/20240208-bus_cleanup-iio-v1-1-4a167c3b5fb3@marliere.net Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 4f89279e531c..e370a7bb3300 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -669,7 +669,7 @@ DEFINE_GUARD_COND(iio_claim_direct, _try, ({ int iio_device_claim_buffer_mode(struct iio_dev *indio_dev); void iio_device_release_buffer_mode(struct iio_dev *indio_dev); -extern struct bus_type iio_bus_type; +extern const struct bus_type iio_bus_type; /** * iio_device_put() - reference counted deallocation of struct device -- cgit v1.2.3 From 3765d426fe864e109d00aeb48f22413b896b2eb9 Mon Sep 17 00:00:00 2001 From: Basavaraj Natikar Date: Mon, 5 Feb 2024 10:59:25 -0800 Subject: iio: hid-sensor-als: Add light color temperature support On some platforms, ambient color sensors also support light color temperature. Add support of light color temperature. 
Signed-off-by: Basavaraj Natikar Signed-off-by: Srinivas Pandruvada Link: https://lore.kernel.org/r/20240205185926.3030521-4-srinivas.pandruvada@linux.intel.com Signed-off-by: Jonathan Cameron --- include/linux/hid-sensor-ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hid-sensor-ids.h b/include/linux/hid-sensor-ids.h index 13b1e65fbdcc..8af4fb3e0254 100644 --- a/include/linux/hid-sensor-ids.h +++ b/include/linux/hid-sensor-ids.h @@ -21,6 +21,7 @@ #define HID_USAGE_SENSOR_ALS 0x200041 #define HID_USAGE_SENSOR_DATA_LIGHT 0x2004d0 #define HID_USAGE_SENSOR_LIGHT_ILLUM 0x2004d1 +#define HID_USAGE_SENSOR_LIGHT_COLOR_TEMPERATURE 0x2004d2 /* PROX (200011) */ #define HID_USAGE_SENSOR_PROX 0x200011 -- cgit v1.2.3 From 2ec17b1950bb824f9a8d5f055e466d02c40eb64c Mon Sep 17 00:00:00 2001 From: Basavaraj Natikar Date: Mon, 5 Feb 2024 10:59:26 -0800 Subject: iio: hid-sensor-als: Add light chromaticity support On some platforms, ambient color sensors also support the x and y light colors, which represent the coordinates on the CIE 1931 chromaticity diagram. Add light chromaticity x and y. 
Signed-off-by: Basavaraj Natikar Signed-off-by: Srinivas Pandruvada Link: https://lore.kernel.org/r/20240205185926.3030521-5-srinivas.pandruvada@linux.intel.com Signed-off-by: Jonathan Cameron --- include/linux/hid-sensor-ids.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid-sensor-ids.h b/include/linux/hid-sensor-ids.h index 8af4fb3e0254..6730ee900ee1 100644 --- a/include/linux/hid-sensor-ids.h +++ b/include/linux/hid-sensor-ids.h @@ -22,6 +22,9 @@ #define HID_USAGE_SENSOR_DATA_LIGHT 0x2004d0 #define HID_USAGE_SENSOR_LIGHT_ILLUM 0x2004d1 #define HID_USAGE_SENSOR_LIGHT_COLOR_TEMPERATURE 0x2004d2 +#define HID_USAGE_SENSOR_LIGHT_CHROMATICITY 0x2004d3 +#define HID_USAGE_SENSOR_LIGHT_CHROMATICITY_X 0x2004d4 +#define HID_USAGE_SENSOR_LIGHT_CHROMATICITY_Y 0x2004d5 /* PROX (200011) */ #define HID_USAGE_SENSOR_PROX 0x200011 -- cgit v1.2.3 From 80e4021c25d8c1ddae0dd655ed5f6b1e938dd79b Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 14 Feb 2024 21:16:19 +0100 Subject: net: mdio: add helpers for accessing the EEE CAP2 registers This adds helpers for accessing the EEE CAP2 registers. For now only 2500baseT and 5000baseT modes are supported. Signed-off-by: Heiner Kallweit Signed-off-by: David S. 
Miller --- include/linux/mdio.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 79ceee3c8673..fd8ff310f9eb 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -439,6 +439,42 @@ static inline void mii_eee_cap1_mod_linkmode_t(unsigned long *adv, u32 val) adv, val & MDIO_EEE_10GKR); } +/** + * mii_eee_cap2_mod_linkmode_sup_t() + * @adv: target the linkmode settings + * @val: register value + * + * A function that translates value of following registers to the linkmode: + * IEEE 802.3-2022 45.2.3.11 "EEE control and capability 2" register (3.21) + */ +static inline void mii_eee_cap2_mod_linkmode_sup_t(unsigned long *adv, u32 val) +{ + linkmode_mod_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT, + adv, val & MDIO_EEE_2_5GT); + linkmode_mod_bit(ETHTOOL_LINK_MODE_5000baseT_Full_BIT, + adv, val & MDIO_EEE_5GT); +} + +/** + * mii_eee_cap2_mod_linkmode_adv_t() + * @adv: target the linkmode advertisement settings + * @val: register value + * + * A function that translates value of following registers to the linkmode: + * IEEE 802.3-2022 45.2.7.16 "EEE advertisement 2" register (7.62) + * IEEE 802.3-2022 45.2.7.17 "EEE link partner ability 2" register (7.63) + * Note: Currently this function is the same as mii_eee_cap2_mod_linkmode_sup_t. + * For certain, not yet supported, modes however the bits differ. + * Therefore create separate functions already. 
+ */ +static inline void mii_eee_cap2_mod_linkmode_adv_t(unsigned long *adv, u32 val) +{ + linkmode_mod_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT, + adv, val & MDIO_EEE_2_5GT); + linkmode_mod_bit(ETHTOOL_LINK_MODE_5000baseT_Full_BIT, + adv, val & MDIO_EEE_5GT); +} + /** * linkmode_to_mii_eee_cap1_t() * @adv: the linkmode advertisement settings @@ -466,6 +502,25 @@ static inline u32 linkmode_to_mii_eee_cap1_t(unsigned long *adv) return result; } +/** + * linkmode_to_mii_eee_cap2_t() + * @adv: the linkmode advertisement settings + * + * A function that translates linkmode to value for IEEE 802.3-2022 45.2.7.16 + * "EEE advertisement 2" register (7.62) + */ +static inline u32 linkmode_to_mii_eee_cap2_t(unsigned long *adv) +{ + u32 result = 0; + + if (linkmode_test_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT, adv)) + result |= MDIO_EEE_2_5GT; + if (linkmode_test_bit(ETHTOOL_LINK_MODE_5000baseT_Full_BIT, adv)) + result |= MDIO_EEE_5GT; + + return result; +} + /** * mii_10base_t1_adv_mod_linkmode_t() * @adv: linkmode advertisement settings -- cgit v1.2.3 From ef6ee3a31bdc699391f2db4eff407fdb06895809 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 14 Feb 2024 21:17:11 +0100 Subject: net: phy: add PHY_EEE_CAP2_FEATURES As a prerequisite for adding EEE CAP2 register support, complement PHY_EEE_CAP1_FEATURES with PHY_EEE_CAP2_FEATURES. For now only 2500baseT and 5000baseT modes are supported. Signed-off-by: Heiner Kallweit Signed-off-by: David S. 
Miller --- include/linux/phy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index c2dda21b39e1..e3ab2c347a59 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -54,6 +54,7 @@ extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_fec_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_full_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_eee_cap1_features) __ro_after_init; +extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_eee_cap2_features) __ro_after_init; #define PHY_BASIC_FEATURES ((unsigned long *)&phy_basic_features) #define PHY_BASIC_T1_FEATURES ((unsigned long *)&phy_basic_t1_features) @@ -65,6 +66,7 @@ extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_eee_cap1_features) __ro_after_init; #define PHY_10GBIT_FEC_FEATURES ((unsigned long *)&phy_10gbit_fec_features) #define PHY_10GBIT_FULL_FEATURES ((unsigned long *)&phy_10gbit_full_features) #define PHY_EEE_CAP1_FEATURES ((unsigned long *)&phy_eee_cap1_features) +#define PHY_EEE_CAP2_FEATURES ((unsigned long *)&phy_eee_cap2_features) extern const int phy_basic_ports_array[3]; extern const int phy_fibre_port_array[1]; -- cgit v1.2.3 From 4a92857d6e8383eca6d661538bb25dc7004fd391 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 16 Feb 2024 14:52:17 +0100 Subject: gpio: constify opaque pointer "data" in gpio_device_find() The opaque pointer "data" in each match function used by gpio_device_find() is a pointer to const, thus the same argument passed to gpio_device_find() can be adjusted similarly. 
Signed-off-by: Krzysztof Kozlowski Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 9d0023f83a57..9c1fbfaebaa8 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -628,7 +628,7 @@ int devm_gpiochip_add_data_with_key(struct device *dev, struct gpio_chip *gc, void *data, struct lock_class_key *lock_key, struct lock_class_key *request_key); -struct gpio_device *gpio_device_find(void *data, +struct gpio_device *gpio_device_find(const void *data, int (*match)(struct gpio_chip *gc, const void *data)); struct gpio_device *gpio_device_find_by_label(const char *label); -- cgit v1.2.3 From 548fcf037b3f8592e9fe41469110453a777416d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 15 Feb 2024 13:15:38 +0200 Subject: tty: Don't include tty_buffer.h in tty.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's no need to include linux/tty_buffer.h in linux/tty.h. Move the include into tty_buffer.c that is actually using it. 
Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20240215111538.1920-1-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index 8c76fd97d4ad..2b2e6f0a54d6 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From d87c295f599cab2ab3b3df53a9098adba4a6002b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 30 Jan 2024 10:46:27 -0800 Subject: sysfs: Introduce a mechanism to hide static attribute_groups Add a mechanism for named attribute_groups to hide their directory at sysfs_update_group() time, or otherwise skip emitting the group directory when the group is first registered. It piggybacks on is_visible() in a similar manner as SYSFS_PREALLOC, i.e. special flags in the upper bits of the returned mode. To use it, specify a symbol prefix to DEFINE_SYSFS_GROUP_VISIBLE(), and then pass that same prefix to SYSFS_GROUP_VISIBLE() when assigning the @is_visible() callback: DEFINE_SYSFS_GROUP_VISIBLE($prefix) struct attribute_group $prefix_group = { .name = $name, .is_visible = SYSFS_GROUP_VISIBLE($prefix), }; SYSFS_GROUP_VISIBLE() expects a definition of $prefix_group_visible() and $prefix_attr_visible(), where $prefix_group_visible() just returns true / false and $prefix_attr_visible() behaves as normal. The motivation for this capability is to centralize PCI device authentication in the PCI core with a named sysfs group while keeping that group hidden for devices and platforms that do not meet the requirements. 
In a PCI topology, most devices will not support authentication, a small subset will support just PCI CMA (Component Measurement and Authentication), a smaller subset will support PCI CMA + PCIe IDE (Link Integrity and Encryption), and only next generation server hosts will start to include a platform TSM (TEE Security Manager). Without this capability the alternatives are: * Check if all attributes are invisible and if so, hide the directory. Beyond trouble getting this to work [1], this is an ABI change for scenarios where userspace happens to depend on group visibility absent any attributes. I.e. this new capability avoids regression since it does not retroactively apply to existing cases. * Publish an empty /sys/bus/pci/devices/$pdev/tsm/ directory for all PCI devices (i.e. for the case when TSM platform support is present, but device support is absent). Unfortunate that this will be a vestigial empty directory in the vast majority of cases. * Reintroduce usage of runtime calls to sysfs_{create,remove}_group() in the PCI core. Bjorn has already indicated that he does not want to see any growth of pci_sysfs_init() [2]. * Drop the named group and simulate a directory by prefixing all TSM-related attributes with "tsm_". Unfortunate to not use the naming capability of a sysfs group as intended. In comparison, there is a small potential for regression if for some reason an @is_visible() callback had dependencies on how many times it was called. Additionally, it is no longer an error to update a group that does not have its directory already present, and it is no longer a WARN() to remove a group that was never visible. 
Link: https://lore.kernel.org/all/2024012321-envious-procedure-4a58@gregkh/ [1] Link: https://lore.kernel.org/linux-pci/20231019200110.GA1410324@bhelgaas/ [2] Signed-off-by: Dan Williams Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/2024013028-deflator-flaring-ec62@gregkh Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 63 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index b717a70219f6..a42642b277dd 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -61,22 +61,32 @@ do { \ /** * struct attribute_group - data structure used to declare an attribute group. * @name: Optional: Attribute group name - * If specified, the attribute group will be created in - * a new subdirectory with this name. + * If specified, the attribute group will be created in a + * new subdirectory with this name. Additionally when a + * group is named, @is_visible and @is_bin_visible may + * return SYSFS_GROUP_INVISIBLE to control visibility of + * the directory itself. * @is_visible: Optional: Function to return permissions associated with an - * attribute of the group. Will be called repeatedly for each - * non-binary attribute in the group. Only read/write + * attribute of the group. Will be called repeatedly for + * each non-binary attribute in the group. Only read/write * permissions as well as SYSFS_PREALLOC are accepted. Must - * return 0 if an attribute is not visible. The returned value - * will replace static permissions defined in struct attribute. + * return 0 if an attribute is not visible. The returned + * value will replace static permissions defined in struct + * attribute. Use SYSFS_GROUP_VISIBLE() when assigning this + * callback to specify separate _group_visible() and + * _attr_visible() handlers. 
* @is_bin_visible: * Optional: Function to return permissions associated with a * binary attribute of the group. Will be called repeatedly * for each binary attribute in the group. Only read/write - * permissions as well as SYSFS_PREALLOC are accepted. Must - * return 0 if a binary attribute is not visible. The returned - * value will replace static permissions defined in - * struct bin_attribute. + * permissions as well as SYSFS_PREALLOC (and the + * visibility flags for named groups) are accepted. Must + * return 0 if a binary attribute is not visible. The + * returned value will replace static permissions defined + * in struct bin_attribute. If @is_visible is not set, Use + * SYSFS_GROUP_VISIBLE() when assigning this callback to + * specify separate _group_visible() and _attr_visible() + * handlers. * @attrs: Pointer to NULL terminated list of attributes. * @bin_attrs: Pointer to NULL terminated list of binary attributes. * Either attrs or bin_attrs or both must be provided. @@ -91,13 +101,42 @@ struct attribute_group { struct bin_attribute **bin_attrs; }; +#define SYSFS_PREALLOC 010000 +#define SYSFS_GROUP_INVISIBLE 020000 + +/* + * The first call to is_visible() in the create / update path may + * indicate visibility for the entire group + */ +#define DEFINE_SYSFS_GROUP_VISIBLE(name) \ + static inline umode_t sysfs_group_visible_##name( \ + struct kobject *kobj, struct attribute *attr, int n) \ + { \ + if (n == 0 && !name##_group_visible(kobj)) \ + return SYSFS_GROUP_INVISIBLE; \ + return name##_attr_visible(kobj, attr, n); \ + } + +/* + * Same as DEFINE_SYSFS_GROUP_VISIBLE, but for groups with only binary + * attributes + */ +#define DEFINE_SYSFS_BIN_GROUP_VISIBLE(name) \ + static inline umode_t sysfs_group_visible_##name( \ + struct kobject *kobj, struct bin_attribute *attr, int n) \ + { \ + if (n == 0 && !name##_group_visible(kobj)) \ + return SYSFS_GROUP_INVISIBLE; \ + return name##_attr_visible(kobj, attr, n); \ + } + +#define SYSFS_GROUP_VISIBLE(fn) 
sysfs_group_visible_##fn + /* * Use these macros to make defining attributes easier. * See include/linux/device.h for examples.. */ -#define SYSFS_PREALLOC 010000 - #define __ATTR(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ -- cgit v1.2.3 From 1a4729ecafc239f922d0c758bab7be0038714e88 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Tue, 23 Jan 2024 17:46:55 +0100 Subject: hrtimers: Move hrtimer base related definitions into hrtimer_defs.h hrtimer base related struct definitions are part of hrtimers.h as it is required there. With this, also the struct documentation which is for core code internal use, is exposed into the general api. To prevent this, move all core internal definitions and the related includes into hrtimer_defs.h. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240123164702.55612-2-anna-maria@linutronix.de --- include/linux/hrtimer.h | 103 ------------------------------------------- include/linux/hrtimer_defs.h | 102 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 103 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 87e3bedf8eb0..792a0ac75378 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -18,12 +18,8 @@ #include #include #include -#include #include -struct hrtimer_clock_base; -struct hrtimer_cpu_base; - /* * Mode arguments of xxx_hrtimer functions: * @@ -98,105 +94,6 @@ struct hrtimer_sleeper { struct task_struct *task; }; -#ifdef CONFIG_64BIT -# define __hrtimer_clock_base_align ____cacheline_aligned -#else -# define __hrtimer_clock_base_align -#endif - -/** - * struct hrtimer_clock_base - the timer base for a specific clock - * @cpu_base: per cpu clock base - * @index: clock type index for per_cpu support when moving a - * timer to a base on another cpu. 
- * @clockid: clock id for per_cpu support - * @seq: seqcount around __run_hrtimer - * @running: pointer to the currently running hrtimer - * @active: red black tree root node for the active timers - * @get_time: function to retrieve the current time of the clock - * @offset: offset of this clock to the monotonic base - */ -struct hrtimer_clock_base { - struct hrtimer_cpu_base *cpu_base; - unsigned int index; - clockid_t clockid; - seqcount_raw_spinlock_t seq; - struct hrtimer *running; - struct timerqueue_head active; - ktime_t (*get_time)(void); - ktime_t offset; -} __hrtimer_clock_base_align; - -enum hrtimer_base_type { - HRTIMER_BASE_MONOTONIC, - HRTIMER_BASE_REALTIME, - HRTIMER_BASE_BOOTTIME, - HRTIMER_BASE_TAI, - HRTIMER_BASE_MONOTONIC_SOFT, - HRTIMER_BASE_REALTIME_SOFT, - HRTIMER_BASE_BOOTTIME_SOFT, - HRTIMER_BASE_TAI_SOFT, - HRTIMER_MAX_CLOCK_BASES, -}; - -/** - * struct hrtimer_cpu_base - the per cpu clock bases - * @lock: lock protecting the base and associated clock bases - * and timers - * @cpu: cpu number - * @active_bases: Bitfield to mark bases with active timers - * @clock_was_set_seq: Sequence counter of clock was set events - * @hres_active: State of high resolution mode - * @in_hrtirq: hrtimer_interrupt() is currently executing - * @hang_detected: The last hrtimer interrupt detected a hang - * @softirq_activated: displays, if the softirq is raised - update of softirq - * related settings is not required then. - * @nr_events: Total number of hrtimer interrupt events - * @nr_retries: Total number of hrtimer interrupt retries - * @nr_hangs: Total number of hrtimer interrupt hangs - * @max_hang_time: Maximum time spent in hrtimer_interrupt - * @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are - * expired - * @timer_waiters: A hrtimer_cancel() invocation waits for the timer - * callback to finish. 
- * @expires_next: absolute time of the next event, is required for remote - * hrtimer enqueue; it is the total first expiry time (hard - * and soft hrtimer are taken into account) - * @next_timer: Pointer to the first expiring timer - * @softirq_expires_next: Time to check, if soft queues needs also to be expired - * @softirq_next_timer: Pointer to the first expiring softirq based timer - * @clock_base: array of clock bases for this cpu - * - * Note: next_timer is just an optimization for __remove_hrtimer(). - * Do not dereference the pointer because it is not reliable on - * cross cpu removals. - */ -struct hrtimer_cpu_base { - raw_spinlock_t lock; - unsigned int cpu; - unsigned int active_bases; - unsigned int clock_was_set_seq; - unsigned int hres_active : 1, - in_hrtirq : 1, - hang_detected : 1, - softirq_activated : 1; -#ifdef CONFIG_HIGH_RES_TIMERS - unsigned int nr_events; - unsigned short nr_retries; - unsigned short nr_hangs; - unsigned int max_hang_time; -#endif -#ifdef CONFIG_PREEMPT_RT - spinlock_t softirq_expiry_lock; - atomic_t timer_waiters; -#endif - ktime_t expires_next; - struct hrtimer *next_timer; - ktime_t softirq_expires_next; - struct hrtimer *softirq_next_timer; - struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; -} ____cacheline_aligned; - static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) { timer->node.expires = time; diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index 2d3e3c5fb946..b12869dba59a 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -3,6 +3,8 @@ #define _LINUX_HRTIMER_DEFS_H #include +#include +#include #ifdef CONFIG_HIGH_RES_TIMERS @@ -24,4 +26,104 @@ #endif +#ifdef CONFIG_64BIT +# define __hrtimer_clock_base_align ____cacheline_aligned +#else +# define __hrtimer_clock_base_align +#endif + +/** + * struct hrtimer_clock_base - the timer base for a specific clock + * @cpu_base: per cpu clock base + * @index: clock type index for 
per_cpu support when moving a + * timer to a base on another cpu. + * @clockid: clock id for per_cpu support + * @seq: seqcount around __run_hrtimer + * @running: pointer to the currently running hrtimer + * @active: red black tree root node for the active timers + * @get_time: function to retrieve the current time of the clock + * @offset: offset of this clock to the monotonic base + */ +struct hrtimer_clock_base { + struct hrtimer_cpu_base *cpu_base; + unsigned int index; + clockid_t clockid; + seqcount_raw_spinlock_t seq; + struct hrtimer *running; + struct timerqueue_head active; + ktime_t (*get_time)(void); + ktime_t offset; +} __hrtimer_clock_base_align; + +enum hrtimer_base_type { + HRTIMER_BASE_MONOTONIC, + HRTIMER_BASE_REALTIME, + HRTIMER_BASE_BOOTTIME, + HRTIMER_BASE_TAI, + HRTIMER_BASE_MONOTONIC_SOFT, + HRTIMER_BASE_REALTIME_SOFT, + HRTIMER_BASE_BOOTTIME_SOFT, + HRTIMER_BASE_TAI_SOFT, + HRTIMER_MAX_CLOCK_BASES, +}; + +/** + * struct hrtimer_cpu_base - the per cpu clock bases + * @lock: lock protecting the base and associated clock bases + * and timers + * @cpu: cpu number + * @active_bases: Bitfield to mark bases with active timers + * @clock_was_set_seq: Sequence counter of clock was set events + * @hres_active: State of high resolution mode + * @in_hrtirq: hrtimer_interrupt() is currently executing + * @hang_detected: The last hrtimer interrupt detected a hang + * @softirq_activated: displays, if the softirq is raised - update of softirq + * related settings is not required then. + * @nr_events: Total number of hrtimer interrupt events + * @nr_retries: Total number of hrtimer interrupt retries + * @nr_hangs: Total number of hrtimer interrupt hangs + * @max_hang_time: Maximum time spent in hrtimer_interrupt + * @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are + * expired + * @timer_waiters: A hrtimer_cancel() invocation waits for the timer + * callback to finish. 
+ * @expires_next: absolute time of the next event, is required for remote + * hrtimer enqueue; it is the total first expiry time (hard + * and soft hrtimer are taken into account) + * @next_timer: Pointer to the first expiring timer + * @softirq_expires_next: Time to check, if soft queues needs also to be expired + * @softirq_next_timer: Pointer to the first expiring softirq based timer + * @clock_base: array of clock bases for this cpu + * + * Note: next_timer is just an optimization for __remove_hrtimer(). + * Do not dereference the pointer because it is not reliable on + * cross cpu removals. + */ +struct hrtimer_cpu_base { + raw_spinlock_t lock; + unsigned int cpu; + unsigned int active_bases; + unsigned int clock_was_set_seq; + unsigned int hres_active : 1, + in_hrtirq : 1, + hang_detected : 1, + softirq_activated : 1; +#ifdef CONFIG_HIGH_RES_TIMERS + unsigned int nr_events; + unsigned short nr_retries; + unsigned short nr_hangs; + unsigned int max_hang_time; +#endif +#ifdef CONFIG_PREEMPT_RT + spinlock_t softirq_expiry_lock; + atomic_t timer_waiters; +#endif + ktime_t expires_next; + struct hrtimer *next_timer; + ktime_t softirq_expires_next; + struct hrtimer *softirq_next_timer; + struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; +} ____cacheline_aligned; + + #endif -- cgit v1.2.3 From ca2768bbf5c48d8c048877dfbceafcebc3f06fa6 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Tue, 23 Jan 2024 17:46:56 +0100 Subject: hrtimers: Update formatting of documentation Documentation of functions lacks the annotations which are used by kernel-doc and *.rst to make appearance in rendered documents more user-friendly. Use those annotations to improve user-friendliness. While at it prevent duplication of comments and use a reference instead. 
Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240123164702.55612-3-anna-maria@linutronix.de --- include/linux/hrtimer.h | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 792a0ac75378..aa1e65ccb615 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -342,20 +342,12 @@ extern u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval); /** - * hrtimer_forward_now - forward the timer expiry so it expires after now + * hrtimer_forward_now() - forward the timer expiry so it expires after now * @timer: hrtimer to forward * @interval: the interval to forward * - * Forward the timer expiry so it will expire after the current time - * of the hrtimer clock base. Returns the number of overruns. - * - * Can be safely called from the callback function of @timer. If - * called from other contexts @timer must neither be enqueued nor - * running the callback and the caller needs to take care of - * serialization. - * - * Note: This only updates the timer expiry value and does not requeue - * the timer. + * It is a variant of hrtimer_forward(). The timer will expire after the current + * time of the hrtimer clock base. See hrtimer_forward() for details. */ static inline u64 hrtimer_forward_now(struct hrtimer *timer, ktime_t interval) -- cgit v1.2.3 From c92a7eb6c642812fb08851e580973c3b83e0227c Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Tue, 23 Jan 2024 17:46:59 +0100 Subject: jiffies: Transform comment about time_* functions into DOC block This general note about time_* functions is also useful to be available in kernel documentation. Therefore transform it into a kernel-doc DOC block with proper formatting. 
Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240123164702.55612-6-anna-maria@linutronix.de --- include/linux/jiffies.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index e0ae2a43e0eb..d9f1435a5a13 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -102,12 +102,15 @@ static inline u64 get_jiffies_64(void) } #endif -/* - * These inlines deal with timer wrapping correctly. You are - * strongly encouraged to use them: - * 1. Because people otherwise forget - * 2. Because if the timer wrap changes in future you won't have to - * alter your driver code. +/** + * DOC: General information about time_* inlines + * + * These inlines deal with timer wrapping correctly. You are strongly encouraged + * to use them: + * + * #. Because people otherwise forget + * #. Because if the timer wrap changes in future you won't have to alter your + * driver code. */ /** -- cgit v1.2.3 From 05013062a89fa4c2d5913dfb81a8ae0268e0a9dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 14 Feb 2024 10:31:54 +0100 Subject: pwm: lpss-*: Make use of devm_pwmchip_alloc() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This prepares the pwm-lpss drivers to further changes of the pwm core outlined in the commit introducing devm_pwmchip_alloc(). There is no intended semantical change and the driver should behave as before. 
Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/b567ab5dd992e361eb884fa6c2cac11be9c7dde3.1707900770.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- include/linux/platform_data/x86/pwm-lpss.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/pwm-lpss.h b/include/linux/platform_data/x86/pwm-lpss.h index c852fe24fe2a..752c06b47cc8 100644 --- a/include/linux/platform_data/x86/pwm-lpss.h +++ b/include/linux/platform_data/x86/pwm-lpss.h @@ -27,7 +27,7 @@ struct pwm_lpss_boardinfo { bool other_devices_aml_touches_pwm_regs; }; -struct pwm_lpss_chip *devm_pwm_lpss_probe(struct device *dev, void __iomem *base, - const struct pwm_lpss_boardinfo *info); +struct pwm_chip *devm_pwm_lpss_probe(struct device *dev, void __iomem *base, + const struct pwm_lpss_boardinfo *info); #endif /* __PLATFORM_DATA_X86_PWM_LPSS_H */ -- cgit v1.2.3 From f70405afc99b1e5a3a1e60b6c05456fde2dbe622 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 19 Feb 2024 15:41:11 +0000 Subject: locking: Add rwsem_assert_held() and rwsem_assert_held_write() Modelled after lockdep_assert_held() and lockdep_assert_held_write(), but are always active, even when lockdep is disabled. Of course, they don't test that _this_ thread is the owner, but it's sufficient to catch many bugs and doesn't incur the same performance penalty as lockdep. Acked-by: "Peter Zijlstra (Intel)" Acked-by: Waiman Long Acked-by: "Darrick J. 
Wong" Reviewed-by: Dave Chinner Signed-off-by: "Matthew Wilcox (Oracle)" Signed-off-by: Chandan Babu R --- include/linux/rwbase_rt.h | 9 +++++++-- include/linux/rwsem.h | 46 +++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 48 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rwbase_rt.h b/include/linux/rwbase_rt.h index 1d264dd08625..29c4e4f243e4 100644 --- a/include/linux/rwbase_rt.h +++ b/include/linux/rwbase_rt.h @@ -26,12 +26,17 @@ struct rwbase_rt { } while (0) -static __always_inline bool rw_base_is_locked(struct rwbase_rt *rwb) +static __always_inline bool rw_base_is_locked(const struct rwbase_rt *rwb) { return atomic_read(&rwb->readers) != READER_BIAS; } -static __always_inline bool rw_base_is_contended(struct rwbase_rt *rwb) +static inline void rw_base_assert_held_write(const struct rwbase_rt *rwb) +{ + WARN_ON(atomic_read(&rwb->readers) != WRITER_BIAS); +} + +static __always_inline bool rw_base_is_contended(const struct rwbase_rt *rwb) { return atomic_read(&rwb->readers) > 0; } diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 9c29689ff505..4f1c18992f76 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -66,14 +66,24 @@ struct rw_semaphore { #endif }; -/* In all implementations count != 0 means locked */ +#define RWSEM_UNLOCKED_VALUE 0UL +#define RWSEM_WRITER_LOCKED (1UL << 0) +#define __RWSEM_COUNT_INIT(name) .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE) + static inline int rwsem_is_locked(struct rw_semaphore *sem) { - return atomic_long_read(&sem->count) != 0; + return atomic_long_read(&sem->count) != RWSEM_UNLOCKED_VALUE; } -#define RWSEM_UNLOCKED_VALUE 0L -#define __RWSEM_COUNT_INIT(name) .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE) +static inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem) +{ + WARN_ON(atomic_long_read(&sem->count) == RWSEM_UNLOCKED_VALUE); +} + +static inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore 
*sem) +{ + WARN_ON(!(atomic_long_read(&sem->count) & RWSEM_WRITER_LOCKED)); +} /* Common initializer macros and functions */ @@ -152,11 +162,21 @@ do { \ __init_rwsem((sem), #sem, &__key); \ } while (0) -static __always_inline int rwsem_is_locked(struct rw_semaphore *sem) +static __always_inline int rwsem_is_locked(const struct rw_semaphore *sem) { return rw_base_is_locked(&sem->rwbase); } +static inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem) +{ + WARN_ON(!rwsem_is_locked(sem)); +} + +static inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem) +{ + rw_base_assert_held_write(sem); +} + static __always_inline int rwsem_is_contended(struct rw_semaphore *sem) { return rw_base_is_contended(&sem->rwbase); @@ -169,6 +189,22 @@ static __always_inline int rwsem_is_contended(struct rw_semaphore *sem) * the RT specific variant. */ +static inline void rwsem_assert_held(const struct rw_semaphore *sem) +{ + if (IS_ENABLED(CONFIG_LOCKDEP)) + lockdep_assert_held(sem); + else + rwsem_assert_held_nolockdep(sem); +} + +static inline void rwsem_assert_held_write(const struct rw_semaphore *sem) +{ + if (IS_ENABLED(CONFIG_LOCKDEP)) + lockdep_assert_held_write(sem); + else + rwsem_assert_held_write_nolockdep(sem); +} + /* * lock for reading */ -- cgit v1.2.3 From b57b4126dd3bb69db876ae7b271307ab7e0458b9 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 5 Feb 2024 12:39:30 +0300 Subject: smp: Make __smp_processor_id() 0-argument macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit smp_processor_id family of macros never accepted any arguments. #define __smp_processor_id(x) works by accident (see C99 6.10.3 §4). __smp_processor_id() gets 1 (empty) argument and passes it down to raw_smp_processor_id() which doesn't accept arguments. 
Signed-off-by: Alexey Dobriyan Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/0037d1f2-8153-4b33-b43e-f4b6ecd710ac@p183 --- include/linux/smp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index e87520dc2959..cc517002c599 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -261,7 +261,7 @@ static inline int get_boot_cpu_id(void) * regular asm read for the stable. */ #ifndef __smp_processor_id -#define __smp_processor_id(x) raw_smp_processor_id(x) +#define __smp_processor_id() raw_smp_processor_id() #endif #ifdef CONFIG_DEBUG_PREEMPT -- cgit v1.2.3 From fb700810d30b9eb333a7bf447012e1158e35c62f Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 13 Feb 2024 18:44:38 +0000 Subject: x86/resctrl: Separate arch and fs resctrl locks resctrl has one mutex that is taken by the architecture-specific code, and the filesystem parts. The two interact via cpuhp, where the architecture code updates the domain list. Filesystem handlers that walk the domains list should not run concurrently with the cpuhp callback modifying the list. Exposing a lock from the filesystem code means the interface is not cleanly defined, and creates the possibility of cross-architecture lock ordering headaches. The interaction only exists so that certain filesystem paths are serialised against CPU hotplug. The CPU hotplug code already has a mechanism to do this using cpus_read_lock(). MPAM's monitors have an overflow interrupt, so it needs to be possible to walk the domains list in irq context. RCU is ideal for this, but some paths need to be able to sleep to allocate memory. Because resctrl_{on,off}line_cpu() take the rdtgroup_mutex as part of a cpuhp callback, cpus_read_lock() must always be taken first. rdtgroup_schemata_write() already does this. Most of the filesystem code's domain list walkers are currently protected by the rdtgroup_mutex taken in rdtgroup_kn_lock_live(). 
The exceptions are rdt_bit_usage_show() and the mon_config helpers which take the lock directly. Make the domain list protected by RCU. An architecture-specific lock prevents concurrent writers. rdt_bit_usage_show() could walk the domain list using RCU, but to keep all the filesystem operations the same, this is changed to call cpus_read_lock(). The mon_config helpers send multiple IPIs, take the cpus_read_lock() in these cases. The other filesystem list walkers need to be able to sleep. Add cpus_read_lock() to rdtgroup_kn_lock_live() so that the cpuhp callbacks can't be invoked when file system operations are occurring. Add lockdep_assert_cpus_held() in the cases where the rdtgroup_kn_lock_live() call isn't obvious. Resctrl's domain online/offline calls now need to take the rdtgroup_mutex themselves. [ bp: Fold in a build fix: https://lore.kernel.org/r/87zfvwieli.ffs@tglx ] Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Babu Moger Tested-by: Carl Worth # arm64 Link: https://lore.kernel.org/r/20240213184438.16675-25-james.morse@arm.com Signed-off-by: Borislav Petkov (AMD) --- include/linux/resctrl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 270ff1d5c051..a365f67131ec 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -159,7 +159,7 @@ struct resctrl_schema; * @cache_level: Which cache level defines scope of this resource * @cache: Cache allocation related data * @membw: If the component has bandwidth controls, their properties. - * @domains: All domains for this resource + * @domains: RCU list of all domains for this resource * @name: Name to use in "schemata" file. * @data_width: Character width of data when displaying * @default_ctrl: Specifies default cache cbm or memory B/W percent. 
-- cgit v1.2.3 From 9c446288d7b31402adb454535cb2c3cbdb55bb88 Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Sat, 10 Feb 2024 21:57:16 +0100 Subject: iio: buffer-dmaengine: export buffer alloc and free functions Export iio_dmaengine_buffer_free() and iio_dmaengine_buffer_alloc(). This is in preparation of introducing IIO backends support. This will allow us to allocate a buffer and control its lifetime from a device different from the one holding the DMA firmware properties. Effectively, in this case the struct device holding the firmware information about the DMA channels is not the same as iio_dev->dev.parent (typical case). While at it, namespace the buffer-dmaengine exports and update the current user of these buffers. Signed-off-by: Nuno Sa Link: https://lore.kernel.org/r/20240210-iio-backend-v11-4-f5242a5fb42a@analog.com Reviewed-by: Andy Shevchenko Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer-dmaengine.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/buffer-dmaengine.h b/include/linux/iio/buffer-dmaengine.h index 5c355be89814..cbb8ba957fad 100644 --- a/include/linux/iio/buffer-dmaengine.h +++ b/include/linux/iio/buffer-dmaengine.h @@ -10,6 +10,9 @@ struct iio_dev; struct device; +struct iio_buffer *iio_dmaengine_buffer_alloc(struct device *dev, + const char *channel); +void iio_dmaengine_buffer_free(struct iio_buffer *buffer); int devm_iio_dmaengine_buffer_setup(struct device *dev, struct iio_dev *indio_dev, const char *channel); -- cgit v1.2.3 From 1a97905d3e48ebe79a06d16143fbfa427c56ce5f Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Sat, 10 Feb 2024 21:57:17 +0100 Subject: iio: add the IIO backend framework This is a Framework to handle complex IIO aggregate devices. The typical architecture is to have one device as the frontend device which can be "linked" against one or multiple backend devices.
All the IIO and userspace interface is expected to be registered/managed by the frontend device which will callback into the backends when needed (to get/set some configuration that it does not directly control). The basic framework interface is pretty simple: - Backends should register themselves with @devm_iio_backend_register() - Frontend devices should get backends with @devm_iio_backend_get() Signed-off-by: Nuno Sa Link: https://lore.kernel.org/r/20240210-iio-backend-v11-5-f5242a5fb42a@analog.com Reviewed-by: Andy Shevchenko Signed-off-by: Jonathan Cameron --- include/linux/iio/backend.h | 72 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 include/linux/iio/backend.h (limited to 'include/linux') diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h new file mode 100644 index 000000000000..a6d79381866e --- /dev/null +++ b/include/linux/iio/backend.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _IIO_BACKEND_H_ +#define _IIO_BACKEND_H_ + +#include + +struct fwnode_handle; +struct iio_backend; +struct device; +struct iio_dev; + +enum iio_backend_data_type { + IIO_BACKEND_TWOS_COMPLEMENT, + IIO_BACKEND_OFFSET_BINARY, + IIO_BACKEND_DATA_TYPE_MAX +}; + +/** + * struct iio_backend_data_fmt - Backend data format + * @type: Data type. + * @sign_extend: Bool to tell if the data is sign extended. + * @enable: Enable/Disable the data format module. If disabled, + * not formatting will happen. + */ +struct iio_backend_data_fmt { + enum iio_backend_data_type type; + bool sign_extend; + bool enable; +}; + +/** + * struct iio_backend_ops - operations structure for an iio_backend + * @enable: Enable backend. + * @disable: Disable backend. + * @chan_enable: Enable one channel. + * @chan_disable: Disable one channel. + * @data_format_set: Configure the data format for a specific channel. + * @request_buffer: Request an IIO buffer. + * @free_buffer: Free an IIO buffer.
+ **/ +struct iio_backend_ops { + int (*enable)(struct iio_backend *back); + void (*disable)(struct iio_backend *back); + int (*chan_enable)(struct iio_backend *back, unsigned int chan); + int (*chan_disable)(struct iio_backend *back, unsigned int chan); + int (*data_format_set)(struct iio_backend *back, unsigned int chan, + const struct iio_backend_data_fmt *data); + struct iio_buffer *(*request_buffer)(struct iio_backend *back, + struct iio_dev *indio_dev); + void (*free_buffer)(struct iio_backend *back, + struct iio_buffer *buffer); +}; + +int iio_backend_chan_enable(struct iio_backend *back, unsigned int chan); +int iio_backend_chan_disable(struct iio_backend *back, unsigned int chan); +int devm_iio_backend_enable(struct device *dev, struct iio_backend *back); +int iio_backend_data_format_set(struct iio_backend *back, unsigned int chan, + const struct iio_backend_data_fmt *data); +int devm_iio_backend_request_buffer(struct device *dev, + struct iio_backend *back, + struct iio_dev *indio_dev); + +void *iio_backend_get_priv(const struct iio_backend *conv); +struct iio_backend *devm_iio_backend_get(struct device *dev, const char *name); +struct iio_backend * +__devm_iio_backend_get_from_fwnode_lookup(struct device *dev, + struct fwnode_handle *fwnode); + +int devm_iio_backend_register(struct device *dev, + const struct iio_backend_ops *ops, void *priv); + +#endif -- cgit v1.2.3 From 794ef0e57854d794173c8ab6bcce3285032dcd95 Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Sat, 10 Feb 2024 21:57:19 +0100 Subject: iio: adc: adi-axi-adc: move to backend framework Move to the IIO backend framework. Devices supported by adi-axi-adc now register themselves as backend devices. 
Signed-off-by: Nuno Sa Link: https://lore.kernel.org/r/20240210-iio-backend-v11-7-f5242a5fb42a@analog.com Reviewed-by: Andy Shevchenko Signed-off-by: Jonathan Cameron --- include/linux/iio/adc/adi-axi-adc.h | 68 ------------------------------------- 1 file changed, 68 deletions(-) delete mode 100644 include/linux/iio/adc/adi-axi-adc.h (limited to 'include/linux') diff --git a/include/linux/iio/adc/adi-axi-adc.h b/include/linux/iio/adc/adi-axi-adc.h deleted file mode 100644 index b7904992d561..000000000000 --- a/include/linux/iio/adc/adi-axi-adc.h +++ /dev/null @@ -1,68 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Analog Devices Generic AXI ADC IP core driver/library - * Link: https://wiki.analog.com/resources/fpga/docs/axi_adc_ip - * - * Copyright 2012-2020 Analog Devices Inc. - */ -#ifndef __ADI_AXI_ADC_H__ -#define __ADI_AXI_ADC_H__ - -struct device; -struct iio_chan_spec; - -/** - * struct adi_axi_adc_chip_info - Chip specific information - * @name Chip name - * @id Chip ID (usually product ID) - * @channels Channel specifications of type @struct iio_chan_spec - * @num_channels Number of @channels - * @scale_table Supported scales by the chip; tuples of 2 ints - * @num_scales Number of scales in the table - * @max_rate Maximum sampling rate supported by the device - */ -struct adi_axi_adc_chip_info { - const char *name; - unsigned int id; - - const struct iio_chan_spec *channels; - unsigned int num_channels; - - const unsigned int (*scale_table)[2]; - int num_scales; - - unsigned long max_rate; -}; - -/** - * struct adi_axi_adc_conv - data of the ADC attached to the AXI ADC - * @chip_info chip info details for the client ADC - * @preenable_setup op to run in the client before enabling the AXI ADC - * @reg_access IIO debugfs_reg_access hook for the client ADC - * @read_raw IIO read_raw hook for the client ADC - * @write_raw IIO write_raw hook for the client ADC - * @read_avail IIO read_avail hook for the client ADC - */ -struct adi_axi_adc_conv { - 
const struct adi_axi_adc_chip_info *chip_info; - - int (*preenable_setup)(struct adi_axi_adc_conv *conv); - int (*reg_access)(struct adi_axi_adc_conv *conv, unsigned int reg, - unsigned int writeval, unsigned int *readval); - int (*read_raw)(struct adi_axi_adc_conv *conv, - struct iio_chan_spec const *chan, - int *val, int *val2, long mask); - int (*write_raw)(struct adi_axi_adc_conv *conv, - struct iio_chan_spec const *chan, - int val, int val2, long mask); - int (*read_avail)(struct adi_axi_adc_conv *conv, - struct iio_chan_spec const *chan, - const int **val, int *type, int *length, long mask); -}; - -struct adi_axi_adc_conv *devm_adi_axi_adc_conv_register(struct device *dev, - size_t sizeof_priv); - -void *adi_axi_adc_conv_priv(struct adi_axi_adc_conv *conv); - -#endif -- cgit v1.2.3 From 74fa8f9c553f7b5ccab7d103acae63cc2e080465 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Feb 2024 08:10:47 +0100 Subject: block: pass a queue_limits argument to blk_alloc_disk Pass a queue_limits to blk_alloc_disk and apply it if non-NULL. This will allow allocating queues with valid queue limits instead of setting the values one at a time later. Also change blk_alloc_disk to return an ERR_PTR instead of just NULL which can't distinguish errors. 
Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Reviewed-by: Himanshu Madhani Link: https://lore.kernel.org/r/20240215071055.2201424-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 45746ba73670..a14ea9344138 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -766,22 +766,26 @@ static inline u64 sb_bdev_nr_blocks(struct super_block *sb) int bdev_disk_changed(struct gendisk *disk, bool invalidate); void put_disk(struct gendisk *disk); -struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass); +struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node, + struct lock_class_key *lkclass); /** * blk_alloc_disk - allocate a gendisk structure + * @lim: queue limits to be used for this disk. * @node_id: numa node to allocate on * * Allocate and pre-initialize a gendisk structure for use with BIO based * drivers. * + * Returns an ERR_PTR on error, else the allocated disk. + * * Context: can sleep */ -#define blk_alloc_disk(node_id) \ +#define blk_alloc_disk(lim, node_id) \ ({ \ static struct lock_class_key __key; \ \ - __blk_alloc_disk(node_id, &__key); \ + __blk_alloc_disk(lim, node_id, &__key); \ }) int __register_blkdev(unsigned int major, const char *name, -- cgit v1.2.3 From dc36561e1548a8ca93b34ef385da03c289ec5ac0 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Sun, 21 Jan 2024 19:09:00 +0800 Subject: firmware: arm_scmi: Implement clock get permissions ARM SCMI v3.2 introduces clock get permission command. To implement the same let us stash the values of those permissions in the scmi_clock_info. They indicate if the operation is forbidden or not. 
If the CLOCK_GET_PERMISSIONS command is not supported, the default permissions are set to allow the operations, otherwise they will be set according to the response of CLOCK_GET_PERMISSIONS from the SCMI platform firmware. Reviewed-by: Cristian Marussi Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/20240121110901.1414856-1-peng.fan@oss.nxp.com Signed-off-by: Sudeep Holla --- include/linux/scmi_protocol.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index f2f05fb42d28..0cc40af5519a 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -47,6 +47,9 @@ struct scmi_clock_info { bool rate_discrete; bool rate_changed_notifications; bool rate_change_requested_notifications; + bool state_ctrl_forbidden; + bool rate_ctrl_forbidden; + bool parent_ctrl_forbidden; union { struct { int num_rates; -- cgit v1.2.3 From 989e8661dc45babf43070d519011dfc1e33c8875 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Sun, 11 Feb 2024 12:51:29 -0300 Subject: firmware: arm_ffa: Make ffa_bus_type const Now that the driver core can properly handle constant struct bus_type, move the ffa_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B. 
Marliere Reviewed-by: Cristian Marussi Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20240211-bus_cleanup-firmware2-v1-1-1851c92c7be7@marliere.net Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index 3d0fde57ba90..c906f666ff5d 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -209,7 +209,7 @@ bool ffa_device_is_valid(struct ffa_device *ffa_dev) { return false; } #define module_ffa_driver(__ffa_driver) \ module_driver(__ffa_driver, ffa_register, ffa_unregister) -extern struct bus_type ffa_bus_type; +extern const struct bus_type ffa_bus_type; /* FFA transport related */ struct ffa_partition_info { -- cgit v1.2.3 From 21d2e6737c9789aa9b23c8a4131cbca8260139fd Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Wed, 14 Feb 2024 14:34:03 -0800 Subject: net: add netmem to skb_frag_t Use struct netmem* instead of page in skb_frag_t. Currently struct netmem* is always a struct page underneath, but the abstraction allows efforts to add support for skb frags not backed by pages. There is unfortunately 1 instance where the skb_frag_t is assumed to be exactly a bio_vec in kcm. For this case, WARN_ON_ONCE and return error before doing a cast. Add skb[_frag]_fill_netmem_*() and skb_add_rx_frag_netmem() helpers so that the API can be used to create netmem skbs.
Signed-off-by: Mina Almasry Acked-by: Paolo Abeni Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 100 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 71 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 696e7680656f..e3a2ed5d09ad 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -37,6 +37,7 @@ #endif #include #include +#include /** * DOC: skb checksums @@ -359,7 +360,11 @@ extern int sysctl_max_skb_frags; */ #define GSO_BY_FRAGS 0xFFFF -typedef struct bio_vec skb_frag_t; +typedef struct skb_frag { + netmem_ref netmem; + unsigned int len; + unsigned int offset; +} skb_frag_t; /** * skb_frag_size() - Returns the size of a skb fragment @@ -367,7 +372,7 @@ typedef struct bio_vec skb_frag_t; */ static inline unsigned int skb_frag_size(const skb_frag_t *frag) { - return frag->bv_len; + return frag->len; } /** @@ -377,7 +382,7 @@ static inline unsigned int skb_frag_size(const skb_frag_t *frag) */ static inline void skb_frag_size_set(skb_frag_t *frag, unsigned int size) { - frag->bv_len = size; + frag->len = size; } /** @@ -387,7 +392,7 @@ static inline void skb_frag_size_set(skb_frag_t *frag, unsigned int size) */ static inline void skb_frag_size_add(skb_frag_t *frag, int delta) { - frag->bv_len += delta; + frag->len += delta; } /** @@ -397,7 +402,7 @@ static inline void skb_frag_size_add(skb_frag_t *frag, int delta) */ static inline void skb_frag_size_sub(skb_frag_t *frag, int delta) { - frag->bv_len -= delta; + frag->len -= delta; } /** @@ -417,7 +422,7 @@ static inline bool skb_frag_must_loop(struct page *p) * skb_frag_foreach_page - loop over pages in a fragment * * @f: skb frag to operate on - * @f_off: offset from start of f->bv_page + * @f_off: offset from start of f->netmem * @f_len: length from f_off to loop over * @p: (temp var) current page * @p_off: (temp var) offset from start of current page, @@ -2429,22 +2434,37 @@ static inline unsigned 
int skb_pagelen(const struct sk_buff *skb) return skb_headlen(skb) + __skb_pagelen(skb); } +static inline void skb_frag_fill_netmem_desc(skb_frag_t *frag, + netmem_ref netmem, int off, + int size) +{ + frag->netmem = netmem; + frag->offset = off; + skb_frag_size_set(frag, size); +} + static inline void skb_frag_fill_page_desc(skb_frag_t *frag, struct page *page, int off, int size) { - frag->bv_page = page; - frag->bv_offset = off; - skb_frag_size_set(frag, size); + skb_frag_fill_netmem_desc(frag, page_to_netmem(page), off, size); +} + +static inline void __skb_fill_netmem_desc_noacc(struct skb_shared_info *shinfo, + int i, netmem_ref netmem, + int off, int size) +{ + skb_frag_t *frag = &shinfo->frags[i]; + + skb_frag_fill_netmem_desc(frag, netmem, off, size); } static inline void __skb_fill_page_desc_noacc(struct skb_shared_info *shinfo, int i, struct page *page, int off, int size) { - skb_frag_t *frag = &shinfo->frags[i]; - - skb_frag_fill_page_desc(frag, page, off, size); + __skb_fill_netmem_desc_noacc(shinfo, i, page_to_netmem(page), off, + size); } /** @@ -2460,10 +2480,10 @@ static inline void skb_len_add(struct sk_buff *skb, int delta) } /** - * __skb_fill_page_desc - initialise a paged fragment in an skb + * __skb_fill_netmem_desc - initialise a fragment in an skb * @skb: buffer containing fragment to be initialised - * @i: paged fragment index to initialise - * @page: the page to use for this fragment + * @i: fragment index to initialise + * @netmem: the netmem to use for this fragment * @off: the offset to the data with @page * @size: the length of the data * @@ -2472,10 +2492,12 @@ static inline void skb_len_add(struct sk_buff *skb, int delta) * * Does not take any additional reference on the fragment. 
*/ -static inline void __skb_fill_page_desc(struct sk_buff *skb, int i, - struct page *page, int off, int size) +static inline void __skb_fill_netmem_desc(struct sk_buff *skb, int i, + netmem_ref netmem, int off, int size) { - __skb_fill_page_desc_noacc(skb_shinfo(skb), i, page, off, size); + struct page *page = netmem_to_page(netmem); + + __skb_fill_netmem_desc_noacc(skb_shinfo(skb), i, netmem, off, size); /* Propagate page pfmemalloc to the skb if we can. The problem is * that not all callers have unique ownership of the page but rely @@ -2483,7 +2505,20 @@ static inline void __skb_fill_page_desc(struct sk_buff *skb, int i, */ page = compound_head(page); if (page_is_pfmemalloc(page)) - skb->pfmemalloc = true; + skb->pfmemalloc = true; +} + +static inline void __skb_fill_page_desc(struct sk_buff *skb, int i, + struct page *page, int off, int size) +{ + __skb_fill_netmem_desc(skb, i, page_to_netmem(page), off, size); +} + +static inline void skb_fill_netmem_desc(struct sk_buff *skb, int i, + netmem_ref netmem, int off, int size) +{ + __skb_fill_netmem_desc(skb, i, netmem, off, size); + skb_shinfo(skb)->nr_frags = i + 1; } /** @@ -2503,8 +2538,7 @@ static inline void __skb_fill_page_desc(struct sk_buff *skb, int i, static inline void skb_fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size) { - __skb_fill_page_desc(skb, i, page, off, size); - skb_shinfo(skb)->nr_frags = i + 1; + skb_fill_netmem_desc(skb, i, page_to_netmem(page), off, size); } /** @@ -2528,8 +2562,16 @@ static inline void skb_fill_page_desc_noacc(struct sk_buff *skb, int i, shinfo->nr_frags = i + 1; } -void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, - int size, unsigned int truesize); +void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem, + int off, int size, unsigned int truesize); + +static inline void skb_add_rx_frag(struct sk_buff *skb, int i, + struct page *page, int off, int size, + unsigned int truesize) +{ + 
skb_add_rx_frag_netmem(skb, i, page_to_netmem(page), off, size, + truesize); +} void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, unsigned int truesize); @@ -3378,7 +3420,7 @@ static inline void skb_propagate_pfmemalloc(const struct page *page, */ static inline unsigned int skb_frag_off(const skb_frag_t *frag) { - return frag->bv_offset; + return frag->offset; } /** @@ -3388,7 +3430,7 @@ static inline unsigned int skb_frag_off(const skb_frag_t *frag) */ static inline void skb_frag_off_add(skb_frag_t *frag, int delta) { - frag->bv_offset += delta; + frag->offset += delta; } /** @@ -3398,7 +3440,7 @@ static inline void skb_frag_off_add(skb_frag_t *frag, int delta) */ static inline void skb_frag_off_set(skb_frag_t *frag, unsigned int offset) { - frag->bv_offset = offset; + frag->offset = offset; } /** @@ -3409,7 +3451,7 @@ static inline void skb_frag_off_set(skb_frag_t *frag, unsigned int offset) static inline void skb_frag_off_copy(skb_frag_t *fragto, const skb_frag_t *fragfrom) { - fragto->bv_offset = fragfrom->bv_offset; + fragto->offset = fragfrom->offset; } /** @@ -3420,7 +3462,7 @@ static inline void skb_frag_off_copy(skb_frag_t *fragto, */ static inline struct page *skb_frag_page(const skb_frag_t *frag) { - return frag->bv_page; + return netmem_to_page(frag->netmem); } /** @@ -3528,7 +3570,7 @@ static inline void *skb_frag_address_safe(const skb_frag_t *frag) static inline void skb_frag_page_copy(skb_frag_t *fragto, const skb_frag_t *fragfrom) { - fragto->bv_page = fragfrom->bv_page; + fragto->netmem = fragfrom->netmem; } bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio); -- cgit v1.2.3 From ddb9fd7a544088ed70eccbb9f85e9cc9952131c1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 16 Feb 2024 21:23:34 +0100 Subject: fs/select: rework stack allocation hack for clang A while ago, we changed the way that select() and poll() preallocate a temporary buffer just under the size of the static warning limit of 1024 
bytes, as clang was frequently going slightly above that limit. The warnings have recently returned and I took another look. As it turns out, clang is not actually inherently worse at reserving stack space, it just happens to inline do_select() into core_sys_select(), while gcc never inlines it. Annotate do_select() to never be inlined and in turn remove the special case for the allocation size. This should give the same behavior for both clang and gcc all the time and once more avoids those warnings. Fixes: ad312f95d41c ("fs/select: avoid clang stack usage warning") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20240216202352.2492798-1-arnd@kernel.org Reviewed-by: Kees Cook Reviewed-by: Andi Kleen Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/poll.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/poll.h b/include/linux/poll.h index a9e0e1c2d1f2..d1ea4f3714a8 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -14,11 +14,7 @@ /* ~832 bytes of stack space used max in sys_select/sys_poll before allocating additional memory. */ -#ifdef __clang__ -#define MAX_STACK_ALLOC 768 -#else #define MAX_STACK_ALLOC 832 -#endif #define FRONTEND_STACK_ALLOC 256 #define SELECT_STACK_ALLOC FRONTEND_STACK_ALLOC #define POLL_STACK_ALLOC FRONTEND_STACK_ALLOC -- cgit v1.2.3 From 6dfee110c6cc7a6c3c1f45a07428c15820b87c1d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 9 Feb 2024 12:40:10 +0000 Subject: locking/atomic: scripts: Clarify ordering of conditional atomics Conditional atomic operations (e.g. cmpxchg()) only provide ordering when the condition holds; when the condition does not hold, the location is not modified and relaxed ordering is provided. Where ordering is needed for failed conditional atomics, it is necessary to use smp_mb__before_atomic() and/or smp_mb__after_atomic(). 
This is explained tersely in memory-barriers.txt, and is implied but not explicitly stated in the kerneldoc comments for the conditional operations. The lack of an explicit statement has led to some off-list queries about the ordering semantics of failing conditional operations, so evidently this is confusing. Update the kerneldoc comments to explicitly describe the lack of ordering for failed conditional atomic operations. For most conditional atomic operations, this is written as: | If (${condition}), atomically updates @v to (${new}) with ${desc_order} ordering. | Otherwise, @v is not modified and relaxed ordering is provided. For the try_cmpxchg() operations, this is written as: | If (${condition}), atomically updates @v to @new with ${desc_order} ordering. | Otherwise, @v is not modified, @old is updated to the current value of @v, | and relaxed ordering is provided. Signed-off-by: Mark Rutland Signed-off-by: Ingo Molnar Reviewed-by: Paul E. McKenney Reviewed-by: Nhat Pham Link: https://lore.kernel.org/r/20240209124010.2096198-1-mark.rutland@arm.com --- include/linux/atomic/atomic-arch-fallback.h | 46 +++++++++++++++---- include/linux/atomic/atomic-instrumented.h | 68 +++++++++++++++++++++++------ include/linux/atomic/atomic-long.h | 24 +++++++--- 3 files changed, 111 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/atomic/atomic-arch-fallback.h b/include/linux/atomic/atomic-arch-fallback.h index 5e95faa959c4..956bcba5dbf2 100644 --- a/include/linux/atomic/atomic-arch-fallback.h +++ b/include/linux/atomic/atomic-arch-fallback.h @@ -2005,6 +2005,7 @@ raw_atomic_xchg_relaxed(atomic_t *v, int new) * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_cmpxchg() elsewhere.
* @@ -2033,6 +2034,7 @@ raw_atomic_cmpxchg(atomic_t *v, int old, int new) * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_cmpxchg_acquire() elsewhere. * @@ -2061,6 +2063,7 @@ raw_atomic_cmpxchg_acquire(atomic_t *v, int old, int new) * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_cmpxchg_release() elsewhere. * @@ -2088,6 +2091,7 @@ raw_atomic_cmpxchg_release(atomic_t *v, int old, int new) * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_cmpxchg_relaxed() elsewhere. * @@ -2112,7 +2116,8 @@ raw_atomic_cmpxchg_relaxed(atomic_t *v, int old, int new) * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. - * Otherwise, updates @old to the current value of @v. + * Otherwise, @v is not modified, @old is updated to the current value of @v, + * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_try_cmpxchg() elsewhere. * @@ -2145,7 +2150,8 @@ raw_atomic_try_cmpxchg(atomic_t *v, int *old, int new) * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. - * Otherwise, updates @old to the current value of @v. + * Otherwise, @v is not modified, @old is updated to the current value of @v, + * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_try_cmpxchg_acquire() elsewhere. 
* @@ -2178,7 +2184,8 @@ raw_atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. - * Otherwise, updates @old to the current value of @v. + * Otherwise, @v is not modified, @old is updated to the current value of @v, + * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_try_cmpxchg_release() elsewhere. * @@ -2210,7 +2217,8 @@ raw_atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. - * Otherwise, updates @old to the current value of @v. + * Otherwise, @v is not modified, @old is updated to the current value of @v, + * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_try_cmpxchg_relaxed() elsewhere. * @@ -2403,6 +2411,7 @@ raw_atomic_add_negative_relaxed(int i, atomic_t *v) * @u: int value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_fetch_add_unless() elsewhere. * @@ -2432,6 +2441,7 @@ raw_atomic_fetch_add_unless(atomic_t *v, int a, int u) * @u: int value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_add_unless() elsewhere. * @@ -2452,6 +2462,7 @@ raw_atomic_add_unless(atomic_t *v, int a, int u) * @v: pointer to atomic_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_inc_not_zero() elsewhere. 
* @@ -2472,6 +2483,7 @@ raw_atomic_inc_not_zero(atomic_t *v) * @v: pointer to atomic_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_inc_unless_negative() elsewhere. * @@ -2499,6 +2511,7 @@ raw_atomic_inc_unless_negative(atomic_t *v) * @v: pointer to atomic_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_dec_unless_positive() elsewhere. * @@ -2526,6 +2539,7 @@ raw_atomic_dec_unless_positive(atomic_t *v) * @v: pointer to atomic_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_dec_if_positive() elsewhere. * @@ -4117,6 +4131,7 @@ raw_atomic64_xchg_relaxed(atomic64_t *v, s64 new) * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_cmpxchg() elsewhere. * @@ -4145,6 +4160,7 @@ raw_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_cmpxchg_acquire() elsewhere. * @@ -4173,6 +4189,7 @@ raw_atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new) * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_cmpxchg_release() elsewhere. 
* @@ -4200,6 +4217,7 @@ raw_atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new) * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_cmpxchg_relaxed() elsewhere. * @@ -4224,7 +4242,8 @@ raw_atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new) * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. - * Otherwise, updates @old to the current value of @v. + * Otherwise, @v is not modified, @old is updated to the current value of @v, + * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_try_cmpxchg() elsewhere. * @@ -4257,7 +4276,8 @@ raw_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. - * Otherwise, updates @old to the current value of @v. + * Otherwise, @v is not modified, @old is updated to the current value of @v, + * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_acquire() elsewhere. * @@ -4290,7 +4310,8 @@ raw_atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. - * Otherwise, updates @old to the current value of @v. + * Otherwise, @v is not modified, @old is updated to the current value of @v, + * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_release() elsewhere. * @@ -4322,7 +4343,8 @@ raw_atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. - * Otherwise, updates @old to the current value of @v. 
+ * Otherwise, @v is not modified, @old is updated to the current value of @v, + * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_relaxed() elsewhere. * @@ -4515,6 +4537,7 @@ raw_atomic64_add_negative_relaxed(s64 i, atomic64_t *v) * @u: s64 value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_fetch_add_unless() elsewhere. * @@ -4544,6 +4567,7 @@ raw_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) * @u: s64 value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_add_unless() elsewhere. * @@ -4564,6 +4588,7 @@ raw_atomic64_add_unless(atomic64_t *v, s64 a, s64 u) * @v: pointer to atomic64_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_inc_not_zero() elsewhere. * @@ -4584,6 +4609,7 @@ raw_atomic64_inc_not_zero(atomic64_t *v) * @v: pointer to atomic64_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_inc_unless_negative() elsewhere. * @@ -4611,6 +4637,7 @@ raw_atomic64_inc_unless_negative(atomic64_t *v) * @v: pointer to atomic64_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_dec_unless_positive() elsewhere. * @@ -4638,6 +4665,7 @@ raw_atomic64_dec_unless_positive(atomic64_t *v) * @v: pointer to atomic64_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. 
+ * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic64_dec_if_positive() elsewhere. * @@ -4662,4 +4690,4 @@ raw_atomic64_dec_if_positive(atomic64_t *v) } #endif /* _LINUX_ATOMIC_FALLBACK_H */ -// eec048affea735b8464f58e6d96992101f8f85f1 +// 14850c0b0db20c62fdc78ccd1d42b98b88d76331 diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h index 54d7bbe0aeaa..debd487fe971 100644 --- a/include/linux/atomic/atomic-instrumented.h +++ b/include/linux/atomic/atomic-instrumented.h @@ -1182,6 +1182,7 @@ atomic_xchg_relaxed(atomic_t *v, int new) * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_cmpxchg() there. * @@ -1202,6 +1203,7 @@ atomic_cmpxchg(atomic_t *v, int old, int new) * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_cmpxchg_acquire() there. * @@ -1221,6 +1223,7 @@ atomic_cmpxchg_acquire(atomic_t *v, int old, int new) * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_cmpxchg_release() there. * @@ -1241,6 +1244,7 @@ atomic_cmpxchg_release(atomic_t *v, int old, int new) * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_cmpxchg_relaxed() there. 
* @@ -1260,7 +1264,8 @@ atomic_cmpxchg_relaxed(atomic_t *v, int old, int new) * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. - * Otherwise, updates @old to the current value of @v. + * Otherwise, @v is not modified, @old is updated to the current value of @v, + * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg() there. * @@ -1282,7 +1287,8 @@ atomic_try_cmpxchg(atomic_t *v, int *old, int new) * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. - * Otherwise, updates @old to the current value of @v. + * Otherwise, @v is not modified, @old is updated to the current value of @v, + * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg_acquire() there. * @@ -1303,7 +1309,8 @@ atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. - * Otherwise, updates @old to the current value of @v. + * Otherwise, @v is not modified, @old is updated to the current value of @v, + * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg_release() there. * @@ -1325,7 +1332,8 @@ atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) * @new: int value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. - * Otherwise, updates @old to the current value of @v. + * Otherwise, @v is not modified, @old is updated to the current value of @v, + * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg_relaxed() there. * @@ -1475,6 +1483,7 @@ atomic_add_negative_relaxed(int i, atomic_t *v) * @u: int value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. 
* * Unsafe to use in noinstr code; use raw_atomic_fetch_add_unless() there. * @@ -1495,6 +1504,7 @@ atomic_fetch_add_unless(atomic_t *v, int a, int u) * @u: int value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_add_unless() there. * @@ -1513,6 +1523,7 @@ atomic_add_unless(atomic_t *v, int a, int u) * @v: pointer to atomic_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_inc_not_zero() there. * @@ -1531,6 +1542,7 @@ atomic_inc_not_zero(atomic_t *v) * @v: pointer to atomic_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_inc_unless_negative() there. * @@ -1549,6 +1561,7 @@ atomic_inc_unless_negative(atomic_t *v) * @v: pointer to atomic_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_dec_unless_positive() there. * @@ -1567,6 +1580,7 @@ atomic_dec_unless_positive(atomic_t *v) * @v: pointer to atomic_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_dec_if_positive() there. * @@ -2746,6 +2760,7 @@ atomic64_xchg_relaxed(atomic64_t *v, s64 new) * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg() there. 
* @@ -2766,6 +2781,7 @@ atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg_acquire() there. * @@ -2785,6 +2801,7 @@ atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new) * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg_release() there. * @@ -2805,6 +2822,7 @@ atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new) * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg_relaxed() there. * @@ -2824,7 +2842,8 @@ atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new) * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. - * Otherwise, updates @old to the current value of @v. + * Otherwise, @v is not modified, @old is updated to the current value of @v, + * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg() there. * @@ -2846,7 +2865,8 @@ atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. - * Otherwise, updates @old to the current value of @v. + * Otherwise, @v is not modified, @old is updated to the current value of @v, + * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg_acquire() there. 
* @@ -2867,7 +2887,8 @@ atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. - * Otherwise, updates @old to the current value of @v. + * Otherwise, @v is not modified, @old is updated to the current value of @v, + * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg_release() there. * @@ -2889,7 +2910,8 @@ atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) * @new: s64 value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. - * Otherwise, updates @old to the current value of @v. + * Otherwise, @v is not modified, @old is updated to the current value of @v, + * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg_relaxed() there. * @@ -3039,6 +3061,7 @@ atomic64_add_negative_relaxed(s64 i, atomic64_t *v) * @u: s64 value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_unless() there. * @@ -3059,6 +3082,7 @@ atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) * @u: s64 value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_add_unless() there. * @@ -3077,6 +3101,7 @@ atomic64_add_unless(atomic64_t *v, s64 a, s64 u) * @v: pointer to atomic64_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_inc_not_zero() there. 
* @@ -3095,6 +3120,7 @@ atomic64_inc_not_zero(atomic64_t *v) * @v: pointer to atomic64_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_inc_unless_negative() there. * @@ -3113,6 +3139,7 @@ atomic64_inc_unless_negative(atomic64_t *v) * @v: pointer to atomic64_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_dec_unless_positive() there. * @@ -3131,6 +3158,7 @@ atomic64_dec_unless_positive(atomic64_t *v) * @v: pointer to atomic64_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic64_dec_if_positive() there. * @@ -4310,6 +4338,7 @@ atomic_long_xchg_relaxed(atomic_long_t *v, long new) * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg() there. * @@ -4330,6 +4359,7 @@ atomic_long_cmpxchg(atomic_long_t *v, long old, long new) * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg_acquire() there. * @@ -4349,6 +4379,7 @@ atomic_long_cmpxchg_acquire(atomic_long_t *v, long old, long new) * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg_release() there. 
* @@ -4369,6 +4400,7 @@ atomic_long_cmpxchg_release(atomic_long_t *v, long old, long new) * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg_relaxed() there. * @@ -4388,7 +4420,8 @@ atomic_long_cmpxchg_relaxed(atomic_long_t *v, long old, long new) * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. - * Otherwise, updates @old to the current value of @v. + * Otherwise, @v is not modified, @old is updated to the current value of @v, + * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg() there. * @@ -4410,7 +4443,8 @@ atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new) * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. - * Otherwise, updates @old to the current value of @v. + * Otherwise, @v is not modified, @old is updated to the current value of @v, + * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg_acquire() there. * @@ -4431,7 +4465,8 @@ atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new) * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. - * Otherwise, updates @old to the current value of @v. + * Otherwise, @v is not modified, @old is updated to the current value of @v, + * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg_release() there. * @@ -4453,7 +4488,8 @@ atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new) * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. - * Otherwise, updates @old to the current value of @v. 
+ * Otherwise, @v is not modified, @old is updated to the current value of @v, + * and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg_relaxed() there. * @@ -4603,6 +4639,7 @@ atomic_long_add_negative_relaxed(long i, atomic_long_t *v) * @u: long value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add_unless() there. * @@ -4623,6 +4660,7 @@ atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u) * @u: long value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_add_unless() there. * @@ -4641,6 +4679,7 @@ atomic_long_add_unless(atomic_long_t *v, long a, long u) * @v: pointer to atomic_long_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_not_zero() there. * @@ -4659,6 +4698,7 @@ atomic_long_inc_not_zero(atomic_long_t *v) * @v: pointer to atomic_long_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_inc_unless_negative() there. * @@ -4677,6 +4717,7 @@ atomic_long_inc_unless_negative(atomic_long_t *v) * @v: pointer to atomic_long_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_unless_positive() there. 
* @@ -4695,6 +4736,7 @@ atomic_long_dec_unless_positive(atomic_long_t *v) * @v: pointer to atomic_long_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Unsafe to use in noinstr code; use raw_atomic_long_dec_if_positive() there. * @@ -5008,4 +5050,4 @@ atomic_long_dec_if_positive(atomic_long_t *v) #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */ -// 2cc4bc990fef44d3836ec108f11b610f3f438184 +// ce5b65e0f1f8a276268b667194581d24bed219d4 diff --git a/include/linux/atomic/atomic-long.h b/include/linux/atomic/atomic-long.h index c82947170ddc..3ef844b3ab8a 100644 --- a/include/linux/atomic/atomic-long.h +++ b/include/linux/atomic/atomic-long.h @@ -1352,6 +1352,7 @@ raw_atomic_long_xchg_relaxed(atomic_long_t *v, long new) * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg() elsewhere. * @@ -1374,6 +1375,7 @@ raw_atomic_long_cmpxchg(atomic_long_t *v, long old, long new) * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg_acquire() elsewhere. * @@ -1396,6 +1398,7 @@ raw_atomic_long_cmpxchg_acquire(atomic_long_t *v, long old, long new) * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg_release() elsewhere. * @@ -1418,6 +1421,7 @@ raw_atomic_long_cmpxchg_release(atomic_long_t *v, long old, long new) * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. 
+ * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_cmpxchg_relaxed() elsewhere. * @@ -1440,7 +1444,8 @@ raw_atomic_long_cmpxchg_relaxed(atomic_long_t *v, long old, long new) * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with full ordering. - * Otherwise, updates @old to the current value of @v. + * Otherwise, @v is not modified, @old is updated to the current value of @v, + * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg() elsewhere. * @@ -1463,7 +1468,8 @@ raw_atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new) * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with acquire ordering. - * Otherwise, updates @old to the current value of @v. + * Otherwise, @v is not modified, @old is updated to the current value of @v, + * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_acquire() elsewhere. * @@ -1486,7 +1492,8 @@ raw_atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new) * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with release ordering. - * Otherwise, updates @old to the current value of @v. + * Otherwise, @v is not modified, @old is updated to the current value of @v, + * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_release() elsewhere. * @@ -1509,7 +1516,8 @@ raw_atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new) * @new: long value to assign * * If (@v == @old), atomically updates @v to @new with relaxed ordering. - * Otherwise, updates @old to the current value of @v. + * Otherwise, @v is not modified, @old is updated to the current value of @v, + * and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_relaxed() elsewhere. 
* @@ -1677,6 +1685,7 @@ raw_atomic_long_add_negative_relaxed(long i, atomic_long_t *v) * @u: long value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_fetch_add_unless() elsewhere. * @@ -1699,6 +1708,7 @@ raw_atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u) * @u: long value to compare with * * If (@v != @u), atomically updates @v to (@v + @a) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_add_unless() elsewhere. * @@ -1719,6 +1729,7 @@ raw_atomic_long_add_unless(atomic_long_t *v, long a, long u) * @v: pointer to atomic_long_t * * If (@v != 0), atomically updates @v to (@v + 1) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_inc_not_zero() elsewhere. * @@ -1739,6 +1750,7 @@ raw_atomic_long_inc_not_zero(atomic_long_t *v) * @v: pointer to atomic_long_t * * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_inc_unless_negative() elsewhere. * @@ -1759,6 +1771,7 @@ raw_atomic_long_inc_unless_negative(atomic_long_t *v) * @v: pointer to atomic_long_t * * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. * * Safe to use in noinstr code; prefer atomic_long_dec_unless_positive() elsewhere. * @@ -1779,6 +1792,7 @@ raw_atomic_long_dec_unless_positive(atomic_long_t *v) * @v: pointer to atomic_long_t * * If (@v > 0), atomically updates @v to (@v - 1) with full ordering. + * Otherwise, @v is not modified and relaxed ordering is provided. 
* * Safe to use in noinstr code; prefer atomic_long_dec_if_positive() elsewhere. * @@ -1795,4 +1809,4 @@ raw_atomic_long_dec_if_positive(atomic_long_t *v) } #endif /* _LINUX_ATOMIC_LONG_H */ -// 4ef23f98c73cff96d239896175fd26b10b88899e +// 1c4a26fc77f345342953770ebe3c4d08e7ce2f9a -- cgit v1.2.3 From 219eee9c0d16f1b754a8b85275854ab17df0850a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 16 Feb 2024 12:36:57 +0100 Subject: net: skbuff: add overflow debug check to pull/push helpers syzbot managed to trigger following splat: BUG: KASAN: use-after-free in __skb_flow_dissect+0x4a3b/0x5e50 Read of size 1 at addr ffff888208a4000e by task a.out/2313 [..] __skb_flow_dissect+0x4a3b/0x5e50 __skb_get_hash+0xb4/0x400 ip_tunnel_xmit+0x77e/0x26f0 ipip_tunnel_xmit+0x298/0x410 .. Analysis shows that the skb has a valid ->head, but bogus ->data pointer. skb->data gets its bogus value via the neigh layer, which does: 1556 __skb_pull(skb, skb_network_offset(skb)); ... and the skb was already dodgy at this point: skb_network_offset(skb) returns a negative value due to an earlier overflow of skb->network_header (u16). __skb_pull thus "adjusts" skb->data by a huge offset, pointing outside skb->head area. Allow debug builds to splat when we try to pull/push more than INT_MAX bytes. 
After this, the syzkaller reproducer yields a more precise splat before the flow dissector attempts to read off skb->data memory: WARNING: CPU: 5 PID: 2313 at include/linux/skbuff.h:2653 neigh_connected_output+0x28e/0x400 ip_finish_output2+0xb25/0xed0 iptunnel_xmit+0x4ff/0x870 ipgre_xmit+0x78e/0xbb0 Signed-off-by: Florian Westphal Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240216113700.23013-1-fw@strlen.de Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e3a2ed5d09ad..28c7cb7ce251 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2684,6 +2684,8 @@ static inline void skb_put_u8(struct sk_buff *skb, u8 val) void *skb_push(struct sk_buff *skb, unsigned int len); static inline void *__skb_push(struct sk_buff *skb, unsigned int len) { + DEBUG_NET_WARN_ON_ONCE(len > INT_MAX); + skb->data -= len; skb->len += len; return skb->data; @@ -2692,6 +2694,8 @@ static inline void *__skb_push(struct sk_buff *skb, unsigned int len) void *skb_pull(struct sk_buff *skb, unsigned int len); static inline void *__skb_pull(struct sk_buff *skb, unsigned int len) { + DEBUG_NET_WARN_ON_ONCE(len > INT_MAX); + skb->len -= len; if (unlikely(skb->len < skb->data_len)) { #if defined(CONFIG_DEBUG_NET) @@ -2716,6 +2720,8 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta); static inline enum skb_drop_reason pskb_may_pull_reason(struct sk_buff *skb, unsigned int len) { + DEBUG_NET_WARN_ON_ONCE(len > INT_MAX); + if (likely(len <= skb_headlen(skb))) return SKB_NOT_DROPPED_YET; -- cgit v1.2.3 From 22ffc748a6475e75e058ecb16c5afdd6b9f1885f Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Mon, 12 Feb 2024 12:32:33 +0000 Subject: firmware: arm_scmi: Report frequencies in the perf notifications Extend the perf notification report to include pre-calculated frequencies corresponding to the reported limits/levels event; 
such frequencies are properly computed based on the stored known OPPs information taking into consideration if the current operating mode is level indexed or not. Signed-off-by: Cristian Marussi Link: https://lore.kernel.org/r/20240212123233.1230090-12-cristian.marussi@arm.com Signed-off-by: Sudeep Holla --- include/linux/scmi_protocol.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 0cc40af5519a..9b9351e07a11 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -956,6 +956,8 @@ struct scmi_perf_limits_report { unsigned int domain_id; unsigned int range_max; unsigned int range_min; + unsigned long range_max_freq; + unsigned long range_min_freq; }; struct scmi_perf_level_report { @@ -963,6 +965,7 @@ struct scmi_perf_level_report { unsigned int agent_id; unsigned int domain_id; unsigned int performance_level; + unsigned long performance_level_freq; }; struct scmi_sensor_trip_point_report { -- cgit v1.2.3 From aaf2230036b709f979caccfbaa7a8bf671c22124 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Tue, 13 Feb 2024 11:45:25 -0300 Subject: tc: make tc_bus_type const Since commit d492cc2573a0 ("driver core: device.h: make struct bus_type a const *"), the driver core can properly handle constant struct bus_type, move the tc_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B. Marliere Acked-by: Maciej W. 
Rozycki Reviewed-by: Greg Kroah-Hartman Signed-off-by: Thomas Bogendoerfer --- include/linux/tc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tc.h b/include/linux/tc.h index a60639f37963..1638660abf5e 100644 --- a/include/linux/tc.h +++ b/include/linux/tc.h @@ -120,7 +120,7 @@ static inline unsigned long tc_get_speed(struct tc_bus *tbus) #ifdef CONFIG_TC -extern struct bus_type tc_bus_type; +extern const struct bus_type tc_bus_type; extern int tc_register_driver(struct tc_driver *tdrv); extern void tc_unregister_driver(struct tc_driver *tdrv); -- cgit v1.2.3 From 78b74638eb6dffd9b24bc3b121556a9039292df6 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Thu, 15 Feb 2024 15:28:59 +0000 Subject: KVM: pfncache: add a mark-dirty helper At the moment pages are marked dirty by open-coded calls to mark_page_dirty_in_slot(), directly dereferencing the gpa and memslot from the cache. After a subsequent patch these may not always be set so add a helper now so that callers will be protected from the need to know about this detail.
Signed-off-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20240215152916.1158-5-paul@xen.org [sean: decrease indentation, use gpa_to_gfn()] Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7e7fd25b09b3..604ae285d9a9 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1795,6 +1795,16 @@ static inline bool kvm_is_error_gpa(struct kvm *kvm, gpa_t gpa) return kvm_is_error_hva(hva); } +static inline void kvm_gpc_mark_dirty_in_slot(struct gfn_to_pfn_cache *gpc) +{ + lockdep_assert_held(&gpc->lock); + + if (!gpc->memslot) + return; + + mark_page_dirty_in_slot(gpc->kvm, gpc->memslot, gpa_to_gfn(gpc->gpa)); +} + enum kvm_stat_kind { KVM_STAT_VM, KVM_STAT_VCPU, -- cgit v1.2.3 From a4bff3df51472f555ab8dea05a3d2faf4abbf199 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Thu, 15 Feb 2024 15:29:00 +0000 Subject: KVM: pfncache: remove KVM_GUEST_USES_PFN usage As noted in [1] the KVM_GUEST_USES_PFN usage flag is never set by any callers of kvm_gpc_init(), and for good reason: the implementation is incomplete/broken. And it's not clear that there will ever be a user of KVM_GUEST_USES_PFN, as coordinating vCPUs with mmu_notifier events is non-trivial. Remove KVM_GUEST_USES_PFN and all related code, e.g. dropping KVM_GUEST_USES_PFN also makes the 'vcpu' argument redundant, to avoid having to reason about broken code as __kvm_gpc_refresh() evolves. Moreover, all existing callers specify KVM_HOST_USES_PFN so the usage check in hva_to_pfn_retry() and hence the 'usage' argument to kvm_gpc_init() are also redundant. 
[1] https://lore.kernel.org/all/ZQiR8IpqOZrOpzHC@google.com Signed-off-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20240215152916.1158-6-paul@xen.org [sean: explicitly call out that guest usage is incomplete] Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 11 +---------- include/linux/kvm_types.h | 8 -------- 2 files changed, 1 insertion(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 604ae285d9a9..3e1c04608c67 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1319,21 +1319,12 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); * * @gpc: struct gfn_to_pfn_cache object. * @kvm: pointer to kvm instance. - * @vcpu: vCPU to be used for marking pages dirty and to be woken on - * invalidation. - * @usage: indicates if the resulting host physical PFN is used while - * the @vcpu is IN_GUEST_MODE (in which case invalidation of - * the cache from MMU notifiers---but not for KVM memslot - * changes!---will also force @vcpu to exit the guest and - * refresh the cache); and/or if the PFN used directly - * by KVM (and thus needs a kernel virtual mapping). * * This sets up a gfn_to_pfn_cache by initializing locks and assigning the * immutable attributes. Note, the cache must be zero-allocated (or zeroed by * the caller before init). 
*/ -void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm, - struct kvm_vcpu *vcpu, enum pfn_cache_usage usage); +void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm); /** * kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 9d1f7835d8c1..d93f6522b2c3 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -49,12 +49,6 @@ typedef u64 hfn_t; typedef hfn_t kvm_pfn_t; -enum pfn_cache_usage { - KVM_GUEST_USES_PFN = BIT(0), - KVM_HOST_USES_PFN = BIT(1), - KVM_GUEST_AND_HOST_USE_PFN = KVM_GUEST_USES_PFN | KVM_HOST_USES_PFN, -}; - struct gfn_to_hva_cache { u64 generation; gpa_t gpa; @@ -69,13 +63,11 @@ struct gfn_to_pfn_cache { unsigned long uhva; struct kvm_memory_slot *memslot; struct kvm *kvm; - struct kvm_vcpu *vcpu; struct list_head list; rwlock_t lock; struct mutex refresh_lock; void *khva; kvm_pfn_t pfn; - enum pfn_cache_usage usage; bool active; bool valid; }; -- cgit v1.2.3 From 9e7325acb3dc8df8a6370ebc18aef107aae17ef0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 15 Feb 2024 15:29:03 +0000 Subject: KVM: s390: Refactor kvm_is_error_gpa() into kvm_is_gpa_in_memslot() Rename kvm_is_error_gpa() to kvm_is_gpa_in_memslot() and invert the polarity accordingly in order to (a) free up kvm_is_error_gpa() to match with kvm_is_error_{hva,page}(), and (b) to make it more obvious that the helper is doing a memslot lookup, i.e. not simply checking for INVALID_GPA. No functional change intended. 
Link: https://lore.kernel.org/r/20240215152916.1158-9-paul@xen.org Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3e1c04608c67..81a9d1cf91a2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1779,11 +1779,11 @@ static inline hpa_t pfn_to_hpa(kvm_pfn_t pfn) return (hpa_t)pfn << PAGE_SHIFT; } -static inline bool kvm_is_error_gpa(struct kvm *kvm, gpa_t gpa) +static inline bool kvm_is_gpa_in_memslot(struct kvm *kvm, gpa_t gpa) { unsigned long hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); - return kvm_is_error_hva(hva); + return !kvm_is_error_hva(hva); } static inline void kvm_gpc_mark_dirty_in_slot(struct gfn_to_pfn_cache *gpc) -- cgit v1.2.3 From 721f5b0dda784829b833039fbb42f420b9f86575 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Thu, 15 Feb 2024 15:29:04 +0000 Subject: KVM: pfncache: allow a cache to be activated with a fixed (userspace) HVA Some pfncache pages may actually be overlays on guest memory that have a fixed HVA within the VMM. It's pointless to invalidate such cached mappings if the overlay is moved so allow a cache to be activated directly with the HVA to cater for such cases. A subsequent patch will make use of this facility. 
Signed-off-by: Paul Durrant Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20240215152916.1158-10-paul@xen.org Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 81a9d1cf91a2..fa070c36f98a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -148,6 +148,11 @@ static inline bool kvm_is_error_hva(unsigned long addr) #endif +static inline bool kvm_is_error_gpa(gpa_t gpa) +{ + return gpa == INVALID_GPA; +} + #define KVM_ERR_PTR_BAD_PAGE (ERR_PTR(-ENOENT)) static inline bool is_error_page(struct page *page) @@ -1344,6 +1349,22 @@ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm); */ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len); +/** + * kvm_gpc_activate_hva - prepare a cached kernel mapping and HPA for a given HVA. + * + * @gpc: struct gfn_to_pfn_cache object. + * @hva: userspace virtual address to map. + * @len: sanity check; the range being access must fit a single page. + * + * @return: 0 for success. + * -EINVAL for a mapping which would cross a page boundary. + * -EFAULT for an untranslatable guest physical address. + * + * The semantics of this function are the same as those of kvm_gpc_activate(). It + * merely bypasses a layer of address translation. + */ +int kvm_gpc_activate_hva(struct gfn_to_pfn_cache *gpc, unsigned long hva, unsigned long len); + /** * kvm_gpc_check - check validity of a gfn_to_pfn_cache. 
* @@ -1390,6 +1411,16 @@ int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len); */ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc); +static inline bool kvm_gpc_is_gpa_active(struct gfn_to_pfn_cache *gpc) +{ + return gpc->active && !kvm_is_error_gpa(gpc->gpa); +} + +static inline bool kvm_gpc_is_hva_active(struct gfn_to_pfn_cache *gpc) +{ + return gpc->active && kvm_is_error_gpa(gpc->gpa); +} + void kvm_sigset_activate(struct kvm_vcpu *vcpu); void kvm_sigset_deactivate(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From 87161a2b0aed9e9b614bbf6fe8697ad560ceb0cb Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 6 Feb 2024 11:21:00 -0800 Subject: f2fs: deprecate io_bits Let's deprecate an unused io_bits feature to save CPU cycles and memory. Reviewed-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 053137a0fe45..9b69c50255b2 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -40,12 +40,6 @@ #define F2FS_ENC_UTF8_12_1 1 -#define F2FS_IO_SIZE(sbi) BIT(F2FS_OPTION(sbi).write_io_size_bits) /* Blocks */ -#define F2FS_IO_SIZE_KB(sbi) BIT(F2FS_OPTION(sbi).write_io_size_bits + 2) /* KB */ -#define F2FS_IO_SIZE_BITS(sbi) (F2FS_OPTION(sbi).write_io_size_bits) /* power of 2 */ -#define F2FS_IO_SIZE_MASK(sbi) (F2FS_IO_SIZE(sbi) - 1) -#define F2FS_IO_ALIGNED(sbi) (F2FS_IO_SIZE(sbi) > 1) - /* This flag is used by node and meta inodes, and by recovery */ #define GFP_F2FS_ZERO (GFP_NOFS | __GFP_ZERO) -- cgit v1.2.3 From 6e5f0f6383b4896c7e9b943d84b136149d0f45e9 Mon Sep 17 00:00:00 2001 From: Hongyu Jin Date: Wed, 24 Jan 2024 13:35:53 +0800 Subject: dm io: Support IO priority Some IO will dispatch from kworker with different io_context settings than the submitting task, we may need to specify a priority to avoid losing priority. 
Add IO priority parameter to dm_io() and update all callers. Co-developed-by: Yibin Ding Signed-off-by: Yibin Ding Signed-off-by: Hongyu Jin Reviewed-by: Eric Biggers Reviewed-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- include/linux/dm-io.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dm-io.h b/include/linux/dm-io.h index 7595142f3fc5..7b2968612b7e 100644 --- a/include/linux/dm-io.h +++ b/include/linux/dm-io.h @@ -80,7 +80,8 @@ void dm_io_client_destroy(struct dm_io_client *client); * error occurred doing io to the corresponding region. */ int dm_io(struct dm_io_request *io_req, unsigned int num_regions, - struct dm_io_region *region, unsigned int long *sync_error_bits); + struct dm_io_region *region, unsigned int long *sync_error_bits, + unsigned short ioprio); #endif /* __KERNEL__ */ #endif /* _LINUX_DM_IO_H */ -- cgit v1.2.3 From e9b2238e47cb66521381c0bf9234e979afa19d77 Mon Sep 17 00:00:00 2001 From: Hongyu Jin Date: Wed, 24 Jan 2024 13:35:54 +0800 Subject: dm bufio: Support IO priority Some IO will dispatch from kworker with different io_context settings than the submitting task, we may need to specify a priority to avoid losing priority. Add dm_bufio_read_with_ioprio() and dm_bufio_prefetch_with_ioprio() for use by bufio users to pass an ioprio other than IOPRIO_DEFAULT. 
Co-developed-by: Yibin Ding Signed-off-by: Yibin Ding Signed-off-by: Hongyu Jin Reviewed-by: Eric Biggers Reviewed-by: Mikulas Patocka [snitzer: introduced _with_ioprio() wrappers to reduce churn] Signed-off-by: Mike Snitzer --- include/linux/dm-bufio.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dm-bufio.h b/include/linux/dm-bufio.h index 75e7d8cbb532..d1503b815a78 100644 --- a/include/linux/dm-bufio.h +++ b/include/linux/dm-bufio.h @@ -64,6 +64,9 @@ void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start); void *dm_bufio_read(struct dm_bufio_client *c, sector_t block, struct dm_buffer **bp); +void *dm_bufio_read_with_ioprio(struct dm_bufio_client *c, sector_t block, + struct dm_buffer **bp, unsigned short ioprio); + /* * Like dm_bufio_read, but return buffer from cache, don't read * it. If the buffer is not in the cache, return NULL. @@ -86,6 +89,10 @@ void *dm_bufio_new(struct dm_bufio_client *c, sector_t block, void dm_bufio_prefetch(struct dm_bufio_client *c, sector_t block, unsigned int n_blocks); +void dm_bufio_prefetch_with_ioprio(struct dm_bufio_client *c, + sector_t block, unsigned int n_blocks, + unsigned short ioprio); + /* * Release a reference obtained with dm_bufio_{read,get,new}. The data * pointer and dm_buffer pointer is no longer valid after this call. -- cgit v1.2.3 From 13ddaf26be324a7f951891ecd9ccd04466d27458 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 7 Feb 2024 02:25:59 +0800 Subject: mm/swap: fix race when skipping swapcache When skipping swapcache for SWP_SYNCHRONOUS_IO, if two or more threads swapin the same entry at the same time, they get different pages (A, B). Before one thread (T0) finishes the swapin and installs page (A) to the PTE, another thread (T1) could finish swapin of page (B), swap_free the entry, then swap out the possibly modified page reusing the same entry. 
It breaks the pte_same check in (T0) because the PTE value is unchanged, causing an ABA problem. Thread (T0) will install a stale page (A) into the PTE and cause data corruption. One possible callstack is like this: CPU0 CPU1 ---- ---- do_swap_page() do_swap_page() with same entry swap_read_folio() <- read to page A swap_read_folio() <- read to page B ... set_pte_at() swap_free() <- entry is free pte_same() <- Check pass, PTE seems unchanged, but page A is stale! swap_free() <- page B content lost! set_pte_at() <- stale page A installed! And besides, for ZRAM, swap_free() allows the swap device to discard the entry content, so even if page (B) is not modified, if swap_read_folio() on CPU0 happens later than swap_free() on CPU1, it may also cause data loss. To fix this, reuse swapcache_prepare which will pin the swap entry using the cache flag, and allow only one thread to swap it in, also prevent any parallel code from putting the entry in the cache. Release the pin after the PT is unlocked. Racers just loop and wait since it's a rare and very short event. A schedule_timeout_uninterruptible(1) call is added to avoid repeated page faults wasting too much CPU, causing livelock or adding too much noise to perf statistics. A similar livelock issue was described in commit 029c4628b2eb ("mm: swap: get rid of livelock in swapin readahead") Reproducer: This race issue can be triggered easily using a well constructed reproducer and patched brd (with a delay in read path) [1]: With latest 6.8 mainline, race caused data loss can be observed easily: $ gcc -g -lpthread test-thread-swap-race.c && ./a.out Polulating 32MB of memory region... Keep swapping out... Starting round 0... Spawning 65536 workers... 32746 workers spawned, wait for done... Round 0: Error on 0x5aa00, expected 32746, got 32743, 3 data loss! Round 0: Error on 0x395200, expected 32746, got 32743, 3 data loss! Round 0: Error on 0x3fd000, expected 32746, got 32737, 9 data loss! Round 0 Failed, 15 data loss!
This reproducer spawns multiple threads sharing the same memory region using a small swap device. Every two threads update mapped pages one by one in opposite directions, trying to create a race, while one dedicated thread keeps swapping the data out using madvise. The reproducer hit the race about once every 5 minutes, so the race should be totally possible in production. After this patch, I ran the reproducer for over a few hundred rounds and no data loss was observed. Performance overhead is minimal, microbenchmark swapin 10G from 32G zram: Before: 10934698 us After: 11157121 us Cached: 13155355 us (Dropping SWP_SYNCHRONOUS_IO flag) [kasong@tencent.com: v4] Link: https://lkml.kernel.org/r/20240219082040.7495-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20240206182559.32264-1-ryncsn@gmail.com Fixes: 0bcac06f27d7 ("mm, swap: skip swapcache for swapin of synchronous device") Reported-by: "Huang, Ying" Closes: https://lore.kernel.org/lkml/87bk92gqpx.fsf_-_@yhuang6-desk2.ccr.corp.intel.com/ Link: https://github.com/ryncsn/emm-test-project/tree/master/swap-stress-race [1] Signed-off-by: Kairui Song Reviewed-by: "Huang, Ying" Acked-by: Yu Zhao Acked-by: David Hildenbrand Acked-by: Chris Li Cc: Hugh Dickins Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Minchan Kim Cc: Yosry Ahmed Cc: Yu Zhao Cc: Barry Song <21cnbao@gmail.com> Cc: SeongJae Park Cc: Signed-off-by: Andrew Morton --- include/linux/swap.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 4db00ddad261..8d28f6091a32 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -549,6 +549,11 @@ static inline int swap_duplicate(swp_entry_t swp) return 0; } +static inline int swapcache_prepare(swp_entry_t swp) +{ + return 0; +} + static inline void swap_free(swp_entry_t swp) { } -- cgit v1.2.3 From 2f3bfa8e30b5b4864a200be0dc2fb55d8e4b35e4 Mon Sep 17 00:00:00 2001 From: Krzysztof
Kozlowski Date: Sat, 17 Feb 2024 11:03:06 +0100 Subject: net: wan: framer: constify of_phandle_args in xlate The xlate callbacks are supposed to translate of_phandle_args to proper provider without modifying the of_phandle_args. Make the argument pointer to const for code safety and readability. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240217100306.86740-1-krzysztof.kozlowski@linaro.org Signed-off-by: Jakub Kicinski --- include/linux/framer/framer-provider.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/framer/framer-provider.h b/include/linux/framer/framer-provider.h index 782cd5fc83d5..f6fd2dd92591 100644 --- a/include/linux/framer/framer-provider.h +++ b/include/linux/framer/framer-provider.h @@ -93,7 +93,7 @@ struct framer_provider { struct module *owner; struct list_head list; struct framer * (*of_xlate)(struct device *dev, - struct of_phandle_args *args); + const struct of_phandle_args *args); }; static inline void framer_set_drvdata(struct framer *framer, void *data) @@ -118,19 +118,19 @@ struct framer *devm_framer_create(struct device *dev, struct device_node *node, const struct framer_ops *ops); struct framer *framer_provider_simple_of_xlate(struct device *dev, - struct of_phandle_args *args); + const struct of_phandle_args *args); struct framer_provider * __framer_provider_of_register(struct device *dev, struct module *owner, struct framer *(*of_xlate)(struct device *dev, - struct of_phandle_args *args)); + const struct of_phandle_args *args)); void framer_provider_of_unregister(struct framer_provider *framer_provider); struct framer_provider * __devm_framer_provider_of_register(struct device *dev, struct module *owner, struct framer *(*of_xlate)(struct device *dev, - struct of_phandle_args *args)); + const struct of_phandle_args *args)); void framer_notify_status_change(struct framer *framer); @@ -154,7 +154,7 @@ static 
inline struct framer *devm_framer_create(struct device *dev, struct devic } static inline struct framer *framer_provider_simple_of_xlate(struct device *dev, - struct of_phandle_args *args) + const struct of_phandle_args *args) { return ERR_PTR(-ENOSYS); } @@ -162,7 +162,7 @@ static inline struct framer *framer_provider_simple_of_xlate(struct device *dev, static inline struct framer_provider * __framer_provider_of_register(struct device *dev, struct module *owner, struct framer *(*of_xlate)(struct device *dev, - struct of_phandle_args *args)) + const struct of_phandle_args *args)) { return ERR_PTR(-ENOSYS); } @@ -174,7 +174,7 @@ void framer_provider_of_unregister(struct framer_provider *framer_provider) static inline struct framer_provider * __devm_framer_provider_of_register(struct device *dev, struct module *owner, struct framer *(*of_xlate)(struct device *dev, - struct of_phandle_args *args)) + const struct of_phandle_args *args)) { return ERR_PTR(-ENOSYS); } -- cgit v1.2.3 From 557f8c582a9ba8abe6aa0fd734b6f342af106b26 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 18 Jan 2024 15:06:05 -0800 Subject: ubsan: Reintroduce signed overflow sanitizer In order to mitigate unexpected signed wrap-around[1], bring back the signed integer overflow sanitizer. It was removed in commit 6aaa31aeb9cf ("ubsan: remove overflow checks") because it was effectively a no-op when combined with -fno-strict-overflow (which correctly changes signed overflow from being "undefined" to being explicitly "wrap around"). Compilers are adjusting their sanitizers to trap wrap-around and to detecting common code patterns that should not be instrumented (e.g. "var + offset < var"). Prepare for this and explicitly rename the option from "OVERFLOW" to "WRAP" to more accurately describe the behavior. To annotate intentional wrap-around arithmetic, the helpers wrapping_add/sub/mul_wrap() can be used for individual statements. 
At the function level, the __signed_wrap attribute can be used to mark an entire function as expecting its signed arithmetic to wrap around. For a single object file the Makefile can use "UBSAN_SIGNED_WRAP_target.o := n" to mark it as wrapping, and for an entire directory, "UBSAN_SIGNED_WRAP := n" can be used. Additionally keep these disabled under CONFIG_COMPILE_TEST for now. Link: https://github.com/KSPP/linux/issues/26 [1] Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Peter Zijlstra Cc: Hao Luo Reviewed-by: Marco Elver Reviewed-by: Justin Stitt Signed-off-by: Kees Cook --- include/linux/compiler_types.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 6f1ca49306d2..ee9d272008a5 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -282,11 +282,18 @@ struct ftrace_likely_data { #define __no_sanitize_or_inline __always_inline #endif +/* Do not trap wrapping arithmetic within an annotated function. */ +#ifdef CONFIG_UBSAN_SIGNED_WRAP +# define __signed_wrap __attribute__((no_sanitize("signed-integer-overflow"))) +#else +# define __signed_wrap +#endif + /* Section for code which can't be instrumented at all */ #define __noinstr_section(section) \ noinline notrace __attribute((__section__(section))) \ __no_kcsan __no_sanitize_address __no_profile __no_sanitize_coverage \ - __no_sanitize_memory + __no_sanitize_memory __signed_wrap #define noinstr __noinstr_section(".noinstr.text") -- cgit v1.2.3 From f478898e0aa74a759fcf629a3ee8b040467b8533 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 2 Feb 2024 03:18:14 -0800 Subject: string: Redefine strscpy_pad() as a macro In preparation for making strscpy_pad()'s 3rd argument optional, redefine it as a macro. This also has the benefit of allowing greater FORTIFY introspection, as it couldn't see into the strscpy() nor the memset() within strscpy_pad().
Cc: Andy Shevchenko Cc: Andrew Morton Cc: Reviewed-by: Justin Stitt Signed-off-by: Kees Cook --- include/linux/string.h | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index ab148d8dbfc1..78b28004c5ba 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -70,8 +70,37 @@ extern char * strncpy(char *,const char *, __kernel_size_t); ssize_t strscpy(char *, const char *, size_t); #endif -/* Wraps calls to strscpy()/memset(), no arch specific code required */ -ssize_t strscpy_pad(char *dest, const char *src, size_t count); +/** + * strscpy_pad() - Copy a C-string into a sized buffer + * @dest: Where to copy the string to + * @src: Where to copy the string from + * @count: Size of destination buffer + * + * Copy the string, or as much of it as fits, into the dest buffer. The + * behavior is undefined if the string buffers overlap. The destination + * buffer is always %NUL terminated, unless it's zero-sized. + * + * If the source string is shorter than the destination buffer, the + * remaining bytes in the buffer will be filled with %NUL bytes. + * + * For full explanation of why you may want to consider using the + * 'strscpy' functions please see the function docstring for strscpy(). + * + * Returns: + * * The number of characters copied (not including the trailing %NULs) + * * -E2BIG if count is 0 or @src was truncated. 
+ */ +#define strscpy_pad(dest, src, count) ({ \ + char *__dst = (dest); \ + const char *__src = (src); \ + const size_t __count = (count); \ + ssize_t __wrote; \ + \ + __wrote = strscpy(__dst, __src, __count); \ + if (__wrote >= 0 && __wrote < __count) \ + memset(__dst + __wrote + 1, 0, __count - __wrote - 1); \ + __wrote; \ +}) #ifndef __HAVE_ARCH_STRCAT extern char * strcat(char *, const char *); -- cgit v1.2.3 From e6584c3964f2ff76a9fb5a701e4a59997b35e547 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 20 Sep 2023 12:38:14 -0700 Subject: string: Allow 2-argument strscpy() Using sizeof(dst) for the "size" argument in strscpy() is the overwhelmingly common case. Instead of requiring this everywhere, allow a 2-argument version to be used that will use the sizeof() internally. There are other functions in the kernel with optional arguments[1], so this isn't unprecedented, and improves readability. Update and relocate the kern-doc for strscpy() too, and drop __HAVE_ARCH_STRSCPY as it is unused. Adjust ARCH=um build to notice the changed export name, as it doesn't do full header includes for the string helpers. 
This could additionally let us save a few hundred lines of code: 1177 files changed, 2455 insertions(+), 3026 deletions(-) with a treewide cleanup using Coccinelle: @needless_arg@ expression DST, SRC; @@ strscpy(DST, SRC -, sizeof(DST) ) Link: https://elixir.bootlin.com/linux/v6.7/source/include/linux/pci.h#L1517 [1] Reviewed-by: Justin Stitt Cc: Andy Shevchenko Cc: linux-hardening@vger.kernel.org Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 22 ++-------------------- include/linux/string.h | 38 +++++++++++++++++++++++++++++++++++--- 2 files changed, 37 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 89a6888f2f9e..06b3aaa63724 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -215,26 +215,8 @@ __kernel_size_t __fortify_strlen(const char * const POS p) } /* Defined after fortified strnlen() to reuse it. */ -extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(strscpy); -/** - * strscpy - Copy a C-string into a sized buffer - * - * @p: Where to copy the string to - * @q: Where to copy the string from - * @size: Size of destination buffer - * - * Copy the source string @q, or as much of it as fits, into the destination - * @p buffer. The behavior is undefined if the string buffers overlap. The - * destination @p buffer is always NUL terminated, unless it's zero-sized. - * - * Preferred to strncpy() since it always returns a valid string, and - * doesn't unnecessarily force the tail of the destination buffer to be - * zero padded. If padding is desired please use strscpy_pad(). - * - * Returns the number of characters copied in @p (not including the - * trailing %NUL) or -E2BIG if @size is 0 or the copy of @q was truncated. 
- */ -__FORTIFY_INLINE ssize_t strscpy(char * const POS p, const char * const POS q, size_t size) +extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(sized_strscpy); +__FORTIFY_INLINE ssize_t sized_strscpy(char * const POS p, const char * const POS q, size_t size) { /* Use string size rather than possible enclosing struct size. */ const size_t p_size = __member_size(p); diff --git a/include/linux/string.h b/include/linux/string.h index 78b28004c5ba..0d66bf9407fd 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -2,6 +2,7 @@ #ifndef _LINUX_STRING_H_ #define _LINUX_STRING_H_ +#include #include #include /* for inline */ #include /* for size_t */ @@ -66,9 +67,40 @@ extern char * strcpy(char *,const char *); #ifndef __HAVE_ARCH_STRNCPY extern char * strncpy(char *,const char *, __kernel_size_t); #endif -#ifndef __HAVE_ARCH_STRSCPY -ssize_t strscpy(char *, const char *, size_t); -#endif +ssize_t sized_strscpy(char *, const char *, size_t); + +/* + * The 2 argument style can only be used when dst is an array with a + * known size. + */ +#define __strscpy0(dst, src, ...) \ + sized_strscpy(dst, src, sizeof(dst) + __must_be_array(dst)) +#define __strscpy1(dst, src, size) sized_strscpy(dst, src, size) + +/** + * strscpy - Copy a C-string into a sized buffer + * @dst: Where to copy the string to + * @src: Where to copy the string from + * @...: Size of destination buffer (optional) + * + * Copy the source string @src, or as much of it as fits, into the + * destination @dst buffer. The behavior is undefined if the string + * buffers overlap. The destination @dst buffer is always NUL terminated, + * unless it's zero-sized. + * + * The size argument @... is only required when @dst is not an array, or + * when the copy needs to be smaller than sizeof(@dst). + * + * Preferred to strncpy() since it always returns a valid string, and + * doesn't unnecessarily force the tail of the destination buffer to be + * zero padded. 
If padding is desired please use strscpy_pad(). + * + * Returns the number of characters copied in @dst (not including the + * trailing %NUL) or -E2BIG if @size is 0 or the copy from @src was + * truncated. + */ +#define strscpy(dst, src, ...) \ + CONCATENATE(__strscpy, COUNT_ARGS(__VA_ARGS__))(dst, src, __VA_ARGS__) /** * strscpy_pad() - Copy a C-string into a sized buffer -- cgit v1.2.3 From 8366d124ec937c3815212c00daf00b687eb27969 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 2 Feb 2024 03:40:23 -0800 Subject: string: Allow 2-argument strscpy_pad() Similar to strscpy(), update strscpy_pad()'s 3rd argument to be optional when the destination is a compile-time known size array. Cc: Andy Shevchenko Cc: Reviewed-by: Justin Stitt Signed-off-by: Kees Cook --- include/linux/string.h | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index 0d66bf9407fd..96e6b1af86b5 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -77,6 +77,10 @@ ssize_t sized_strscpy(char *, const char *, size_t); sized_strscpy(dst, src, sizeof(dst) + __must_be_array(dst)) #define __strscpy1(dst, src, size) sized_strscpy(dst, src, size) +#define __strscpy_pad0(dst, src, ...) \ + sized_strscpy_pad(dst, src, sizeof(dst) + __must_be_array(dst)) +#define __strscpy_pad1(dst, src, size) sized_strscpy_pad(dst, src, size) + /** * strscpy - Copy a C-string into a sized buffer * @dst: Where to copy the string to @@ -102,11 +106,23 @@ ssize_t sized_strscpy(char *, const char *, size_t); #define strscpy(dst, src, ...) 
\ CONCATENATE(__strscpy, COUNT_ARGS(__VA_ARGS__))(dst, src, __VA_ARGS__) +#define sized_strscpy_pad(dest, src, count) ({ \ + char *__dst = (dest); \ + const char *__src = (src); \ + const size_t __count = (count); \ + ssize_t __wrote; \ + \ + __wrote = sized_strscpy(__dst, __src, __count); \ + if (__wrote >= 0 && __wrote < __count) \ + memset(__dst + __wrote + 1, 0, __count - __wrote - 1); \ + __wrote; \ +}) + /** * strscpy_pad() - Copy a C-string into a sized buffer - * @dest: Where to copy the string to + * @dst: Where to copy the string to * @src: Where to copy the string from - * @count: Size of destination buffer + * @...: Size of destination buffer * * Copy the string, or as much of it as fits, into the dest buffer. The * behavior is undefined if the string buffers overlap. The destination @@ -122,17 +138,8 @@ ssize_t sized_strscpy(char *, const char *, size_t); * * The number of characters copied (not including the trailing %NULs) * * -E2BIG if count is 0 or @src was truncated. */ -#define strscpy_pad(dest, src, count) ({ \ - char *__dst = (dest); \ - const char *__src = (src); \ - const size_t __count = (count); \ - ssize_t __wrote; \ - \ - __wrote = strscpy(__dst, __src, __count); \ - if (__wrote >= 0 && __wrote < __count) \ - memset(__dst + __wrote + 1, 0, __count - __wrote - 1); \ - __wrote; \ -}) +#define strscpy_pad(dst, src, ...) \ + CONCATENATE(__strscpy_pad, COUNT_ARGS(__VA_ARGS__))(dst, src, __VA_ARGS__) #ifndef __HAVE_ARCH_STRCAT extern char * strcat(char *, const char *); -- cgit v1.2.3 From e9a8e01f9b133c145dd125021ec47c006d108af4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 20 Feb 2024 19:36:14 -1000 Subject: workqueue: Clean up enum work_bits and related constants The bits of work->data are used for a few different purposes. How the bits are used is determined by enum work_bits. The planned disable/enable support will add another use, so let's clean it up a bit in preparation. 
- Let WORK_STRUCT_*_BIT's values be determined by enum definition order. - Delimit different bit sections the same way using SHIFT and BITS values. - Rename __WORK_OFFQ_CANCELING to WORK_OFFQ_CANCELING_BIT for consistency. - Introduce WORK_STRUCT_PWQ_SHIFT and replace WORK_STRUCT_FLAG_MASK and WORK_STRUCT_WQ_DATA_MASK with WORK_STRUCT_PWQ_MASK for clarity. - Improve documentation. No functional changes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 58 ++++++++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 1565bab9edc8..0ad534fe6673 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -24,41 +24,49 @@ enum work_bits { WORK_STRUCT_PENDING_BIT = 0, /* work item is pending execution */ - WORK_STRUCT_INACTIVE_BIT= 1, /* work item is inactive */ - WORK_STRUCT_PWQ_BIT = 2, /* data points to pwq */ - WORK_STRUCT_LINKED_BIT = 3, /* next work is linked to this one */ + WORK_STRUCT_INACTIVE_BIT, /* work item is inactive */ + WORK_STRUCT_PWQ_BIT, /* data points to pwq */ + WORK_STRUCT_LINKED_BIT, /* next work is linked to this one */ #ifdef CONFIG_DEBUG_OBJECTS_WORK - WORK_STRUCT_STATIC_BIT = 4, /* static initializer (debugobjects) */ - WORK_STRUCT_COLOR_SHIFT = 5, /* color for workqueue flushing */ -#else - WORK_STRUCT_COLOR_SHIFT = 4, /* color for workqueue flushing */ + WORK_STRUCT_STATIC_BIT, /* static initializer (debugobjects) */ #endif + WORK_STRUCT_FLAG_BITS, + /* color for workqueue flushing */ + WORK_STRUCT_COLOR_SHIFT = WORK_STRUCT_FLAG_BITS, WORK_STRUCT_COLOR_BITS = 4, /* - * Reserve 8 bits off of pwq pointer w/ debugobjects turned off. - * This makes pwqs aligned to 256 bytes and allows 16 workqueue - * flush colors. + * When WORK_STRUCT_PWQ is set, reserve 8 bits off of pwq pointer w/ + * debugobjects turned off.
This makes pwqs aligned to 256 bytes (512 + * bytes w/ DEBUG_OBJECTS_WORK) and allows 16 workqueue flush colors. + * + * MSB + * [ pwq pointer ] [ flush color ] [ STRUCT flags ] + * 4 bits 4 or 5 bits */ - WORK_STRUCT_FLAG_BITS = WORK_STRUCT_COLOR_SHIFT + - WORK_STRUCT_COLOR_BITS, + WORK_STRUCT_PWQ_SHIFT = WORK_STRUCT_COLOR_SHIFT + WORK_STRUCT_COLOR_BITS, - /* data contains off-queue information when !WORK_STRUCT_PWQ */ - WORK_OFFQ_FLAG_BASE = WORK_STRUCT_COLOR_SHIFT, - - __WORK_OFFQ_CANCELING = WORK_OFFQ_FLAG_BASE, + /* + * data contains off-queue information when !WORK_STRUCT_PWQ. + * + * MSB + * [ pool ID ] [ OFFQ flags ] [ STRUCT flags ] + * 1 bit 4 or 5 bits + */ + WORK_OFFQ_FLAG_SHIFT = WORK_STRUCT_FLAG_BITS, + WORK_OFFQ_CANCELING_BIT = WORK_OFFQ_FLAG_SHIFT, + WORK_OFFQ_FLAG_END, + WORK_OFFQ_FLAG_BITS = WORK_OFFQ_FLAG_END - WORK_OFFQ_FLAG_SHIFT, /* - * When a work item is off queue, its high bits point to the last - * pool it was on. Cap at 31 bits and use the highest number to - * indicate that no pool is associated. + * When a work item is off queue, the high bits encode off-queue flags + * and the last pool it was on. Cap pool ID to 31 bits and use the + * highest number to indicate that no pool is associated. */ - WORK_OFFQ_FLAG_BITS = 1, - WORK_OFFQ_POOL_SHIFT = WORK_OFFQ_FLAG_BASE + WORK_OFFQ_FLAG_BITS, + WORK_OFFQ_POOL_SHIFT = WORK_OFFQ_FLAG_SHIFT + WORK_OFFQ_FLAG_BITS, WORK_OFFQ_LEFT = BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT, WORK_OFFQ_POOL_BITS = WORK_OFFQ_LEFT <= 31 ? WORK_OFFQ_LEFT : 31, - }; enum work_flags { @@ -88,12 +96,10 @@ enum wq_misc_consts { }; /* Convenience constants - of type 'unsigned long', not 'enum'! 
*/ -#define WORK_OFFQ_CANCELING (1ul << __WORK_OFFQ_CANCELING) +#define WORK_OFFQ_CANCELING (1ul << WORK_OFFQ_CANCELING_BIT) #define WORK_OFFQ_POOL_NONE ((1ul << WORK_OFFQ_POOL_BITS) - 1) #define WORK_STRUCT_NO_POOL (WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT) - -#define WORK_STRUCT_FLAG_MASK ((1ul << WORK_STRUCT_FLAG_BITS) - 1) -#define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK) +#define WORK_STRUCT_PWQ_MASK (~((1ul << WORK_STRUCT_PWQ_SHIFT) - 1)) #define WORK_DATA_INIT() ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL) #define WORK_DATA_STATIC_INIT() \ -- cgit v1.2.3 From f0397e27d1204a7e6581d140c2de7fd11383c6ba Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Mon, 19 Feb 2024 11:07:48 -0700 Subject: Revert "bus: mhi: core: Add support for reading MHI info from device" This reverts commit 3316ab2b45f6bf4797d8d65b22fda3cc13318890. The MHI spec owner pointed out that the SOC_HW_VERSION register is part of the BHIe segment, and only valid on devices which implement BHIe. Only a small subset of MHI devices implement BHIe so blindly accessing the register for all devices is not correct. Also, since the BHIe segment offset is not used when accessing the register, any implementation which moves the BHIe segment will result in accessing some other register. We've seen that accessing this register on AIC100 which does not support BHIe can result in initialization failures. We could try to put checks into the code to address these issues, but in the roughly 4 years this functionality has existed, no one has used it. Easier to drop this dead code and address the issues if anyone comes up with a real world use for it. 
Signed-off-by: Jeffrey Hugo Reviewed-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20240219180748.1591527-1-quic_jhugo@quicinc.com Signed-off-by: Manivannan Sadhasivam --- include/linux/mhi.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mhi.h b/include/linux/mhi.h index 474d32cb0520..77b8c0a26674 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -320,10 +320,6 @@ struct mhi_controller_config { * @hw_ev_rings: Number of hardware event rings * @sw_ev_rings: Number of software event rings * @nr_irqs: Number of IRQ allocated by bus master (required) - * @family_number: MHI controller family number - * @device_number: MHI controller device number - * @major_version: MHI controller major revision number - * @minor_version: MHI controller minor revision number * @serial_number: MHI controller serial number obtained from BHI * @mhi_event: MHI event ring configurations table * @mhi_cmd: MHI command ring configurations table @@ -368,15 +364,6 @@ struct mhi_controller_config { * Fields marked as (required) need to be populated by the controller driver * before calling mhi_register_controller(). For the fields marked as (optional) * they can be populated depending on the usecase. - * - * The following fields are present for the purpose of implementing any device - * specific quirks or customizations for specific MHI revisions used in device - * by the controller drivers. 
The MHI stack will just populate these fields - * during mhi_register_controller(): - * family_number - * device_number - * major_version - * minor_version */ struct mhi_controller { struct device *cntrl_dev; @@ -407,10 +394,6 @@ struct mhi_controller { u32 hw_ev_rings; u32 sw_ev_rings; u32 nr_irqs; - u32 family_number; - u32 device_number; - u32 major_version; - u32 minor_version; u32 serial_number; struct mhi_event *mhi_event; -- cgit v1.2.3 From b64e74e95aa6491b31477e9002aab1d8df3995bf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Feb 2024 07:27:09 +0100 Subject: mm: move mapping_set_update out of <linux/swap.h> mapping_set_update is only used inside mm/. Move mapping_set_update to mm/internal.h and turn it into an inline function instead of a macro. Signed-off-by: Christoph Hellwig Reviewed-by: "Matthew Wilcox (Oracle)" Signed-off-by: Chandan Babu R --- include/linux/swap.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 4db00ddad261..755fc64ba48d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -350,16 +350,6 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg); void workingset_refault(struct folio *folio, void *shadow); void workingset_activation(struct folio *folio); -/* Only track the nodes of mappings with shadow entries */ -void workingset_update_node(struct xa_node *node); -extern struct list_lru shadow_nodes; -#define mapping_set_update(xas, mapping) do { \ - if (!dax_mapping(mapping) && !shmem_mapping(mapping)) { \ - xas_set_update(xas, workingset_update_node); \ - xas_set_lru(xas, &shadow_nodes); \ - } \ -} while (0) - /* linux/mm/page_alloc.c */ extern unsigned long totalreserve_pages; -- cgit v1.2.3 From aefacb2041f77784059b86c5fd151066859ad19a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Feb 2024 07:27:10 +0100 Subject: shmem: move shmem_mapping out of line shmem_aops really should not be 
exported to the world. Move shmem_mapping and export it as internal for the one semi-legitimate modular user in udmabuf. This effectively reverts commit 30e6a51dbb05 ("mm/shmem.c: make shmem_mapping() inline"), which added a bogus shmem_aops non-GPL export for no reason whatsoever as there was no shmem_mapping call outside of core MM code at that point. Signed-off-by: Christoph Hellwig Reviewed-by: "Matthew Wilcox (Oracle)" Signed-off-by: Chandan Babu R --- include/linux/shmem_fs.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 2caa6b86106a..6b96a87e4bc8 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -97,11 +97,7 @@ extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts); #ifdef CONFIG_SHMEM -extern const struct address_space_operations shmem_aops; -static inline bool shmem_mapping(struct address_space *mapping) -{ - return mapping->a_ops == &shmem_aops; -} +bool shmem_mapping(struct address_space *mapping); #else static inline bool shmem_mapping(struct address_space *mapping) { -- cgit v1.2.3 From f3e6b3ae9cfc128af11b665c6ef4022ba2683778 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 18 Feb 2024 14:28:57 -0800 Subject: acpi/ghes: Remove CXL CPER notifications Initial tests with the CXL CPER implementation identified that error reports were being duplicated in the log and the trace event [1]. Then it was discovered that the notification handler took sleeping locks while the GHES event handling runs in spin_lock_irqsave() context [2] While the duplicate reporting was fixed in v6.8-rc4, the fix for the sleeping-lock-vs-atomic collision would enjoy more time to settle and gain some test cycles. 
Given how late it is in the development cycle, remove the CXL hookup for now and try again during the next merge window. Note that end result is that v6.8 does not emit CXL CPER payloads to the kernel log, but this is in line with the CXL trend to move error reporting to trace events instead of the kernel log. Cc: Ard Biesheuvel Cc: Rafael J. Wysocki Cc: Jonathan Cameron Reviewed-by: Ira Weiny Link: http://lore.kernel.org/r/20240108165855.00002f5a@Huawei.com [1] Closes: http://lore.kernel.org/r/b963c490-2c13-4b79-bbe7-34c6568423c7@moroto.mountain [2] Signed-off-by: Dan Williams --- include/linux/cxl-event.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h index 91125eca4c8a..03fa6d50d46f 100644 --- a/include/linux/cxl-event.h +++ b/include/linux/cxl-event.h @@ -140,22 +140,4 @@ struct cxl_cper_event_rec { union cxl_event event; } __packed; -typedef void (*cxl_cper_callback)(enum cxl_event_type type, - struct cxl_cper_event_rec *rec); - -#ifdef CONFIG_ACPI_APEI_GHES -int cxl_cper_register_callback(cxl_cper_callback callback); -int cxl_cper_unregister_callback(cxl_cper_callback callback); -#else -static inline int cxl_cper_register_callback(cxl_cper_callback callback) -{ - return 0; -} - -static inline int cxl_cper_unregister_callback(cxl_cper_callback callback) -{ - return 0; -} -#endif - #endif /* _LINUX_CXL_EVENT_H */ -- cgit v1.2.3 From ecba88a3b32d733d41e27973e25b2bc580f64281 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 17 Feb 2024 15:23:54 -0500 Subject: libfs: Add simple_offset_empty() For simple filesystems that use directory offset mapping, rely strictly on the directory offset map to tell when a directory has no children. After this patch is applied, the emptiness test holds only the RCU read lock when the directory being tested has no children. 
In addition, this adds another layer of confirmation that simple_offset_add/remove() are working as expected. Reviewed-by: Jan Kara Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/170820143463.6328.7872919188371286951.stgit@91.116.238.104.host.secureserver.net Signed-off-by: Christian Brauner --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a70495..03d141809a2c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3267,6 +3267,7 @@ struct offset_ctx { void simple_offset_init(struct offset_ctx *octx); int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry); void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry); +int simple_offset_empty(struct dentry *dentry); int simple_offset_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, -- cgit v1.2.3 From 9b6713cc75229f25552c643083cbdbfb771e5bca Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 17 Feb 2024 15:24:01 -0500 Subject: maple_tree: Add mtree_alloc_cyclic() I need a cyclic allocator for the simple_offset implementation in fs/libfs.c. 
Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/170820144179.6328.12838600511394432325.stgit@91.116.238.104.host.secureserver.net Signed-off-by: Christian Brauner --- include/linux/maple_tree.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index b3d63123b945..a53ad4dabd7e 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -171,6 +171,7 @@ enum maple_type { #define MT_FLAGS_LOCK_IRQ 0x100 #define MT_FLAGS_LOCK_BH 0x200 #define MT_FLAGS_LOCK_EXTERN 0x300 +#define MT_FLAGS_ALLOC_WRAPPED 0x0800 #define MAPLE_HEIGHT_MAX 31 @@ -319,6 +320,9 @@ int mtree_insert_range(struct maple_tree *mt, unsigned long first, int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp); +int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp, + void *entry, unsigned long range_lo, unsigned long range_hi, + unsigned long *next, gfp_t gfp); int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp); @@ -499,6 +503,9 @@ void *mas_find_range(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); void *mas_find_range_rev(struct ma_state *mas, unsigned long max); int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); +int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, + void *entry, unsigned long range_lo, unsigned long range_hi, + unsigned long *next, gfp_t gfp); bool mas_nomem(struct ma_state *mas, gfp_t gfp); void mas_pause(struct ma_state *mas); -- cgit v1.2.3 From 0e4a862174f2a8d1653a8a9cf0815020e1d3af24 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 17 Feb 2024 15:24:16 -0500 Subject: libfs: Convert simple directory offsets to use a Maple Tree Test robot reports: > kernel test robot 
noticed a -19.0% regression of aim9.disk_src.ops_per_sec on: > > commit: a2e459555c5f9da3e619b7e47a63f98574dc75f1 ("shmem: stable directory offsets") > https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git master Feng Tang further clarifies that: > ... the new simple_offset_add() > called by shmem_mknod() brings extra cost related with slab, > specifically the 'radix_tree_node', which cause the regression. Willy's analysis is that, over time, the test workload causes xa_alloc_cyclic() to fragment the underlying SLAB cache. This patch replaces the offset_ctx's xarray with a Maple Tree in the hope that Maple Tree's dense node mode will handle this scenario more scalably. In addition, we can widen the simple directory offset maximum to signed long (as loff_t is also signed). Suggested-by: Matthew Wilcox Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202309081306.3ecb3734-oliver.sang@intel.com Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/170820145616.6328.12620992971699079156.stgit@91.116.238.104.host.secureserver.net Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 03d141809a2c..55144c12ee0f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -3260,8 +3261,8 @@ extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count); struct offset_ctx { - struct xarray xa; - u32 next_offset; + struct maple_tree mt; + unsigned long next_offset; }; void simple_offset_init(struct offset_ctx *octx); -- cgit v1.2.3 From 26ea8511c849f9fea325bcdbd8b41ea031a53afe Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 29 Jan 2024 12:52:11 +0100 Subject: of: Add of_phandle_args_equal() helper Add a helper comparing two "struct 
of_phandle_args" to avoid reinventing the wheel. Reviewed-by: Philipp Zabel Acked-by: Rob Herring Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240129115216.96479-2-krzysztof.kozlowski@linaro.org Signed-off-by: Philipp Zabel --- include/linux/of.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 6a9ddf20e79a..85bcc05b278d 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -1065,6 +1065,22 @@ static inline int of_parse_phandle_with_optional_args(const struct device_node * 0, index, out_args); } +/** + * of_phandle_args_equal() - Compare two of_phandle_args + * @a1: First of_phandle_args to compare + * @a2: Second of_phandle_args to compare + * + * Return: True if a1 and a2 are the same (same node pointer, same phandle + * args), false otherwise. + */ +static inline bool of_phandle_args_equal(const struct of_phandle_args *a1, + const struct of_phandle_args *a2) +{ + return a1->np == a2->np && + a1->args_count == a2->args_count && + !memcmp(a1->args, a2->args, sizeof(a1->args[0]) * a1->args_count); +} + /** * of_property_count_u8_elems - Count the number of u8 elements in a property * -- cgit v1.2.3 From 0f28982835c29752cdb657f1f8316b2ea42c407a Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 29 Jan 2024 12:52:12 +0100 Subject: cpufreq: do not open-code of_phandle_args_equal() Use newly added of_phandle_args_equal() helper to compare two of_phandle_args. 
Acked-by: Viresh Kumar Reviewed-by: Philipp Zabel Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240129115216.96479-3-krzysztof.kozlowski@linaro.org Signed-off-by: Philipp Zabel --- include/linux/cpufreq.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index afda5f24d3dd..3cd06dafb04b 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -1149,8 +1149,7 @@ static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_ if (ret < 0) continue; - if (pargs->np == args.np && pargs->args_count == args.args_count && - !memcmp(pargs->args, args.args, sizeof(args.args[0]) * args.args_count)) + if (of_phandle_args_equal(pargs, &args)) cpumask_set_cpu(cpu, cpumask); of_node_put(args.np); -- cgit v1.2.3 From c721f189e89c0d4db119d7bb2b46768d0fb5f6b1 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 29 Jan 2024 12:52:14 +0100 Subject: reset: Instantiate reset GPIO controller for shared reset-gpios Devices sharing a reset GPIO could use the reset framework for coordinated handling of that shared GPIO line. We have several cases of such needs, at least for Devicetree-based platforms. If Devicetree-based device requests a reset line, while "resets" Devicetree property is missing but there is a "reset-gpios" one, instantiate a new "reset-gpio" platform device which will handle such reset line. This allows seamless handling of such shared reset-gpios without need of changing Devicetree binding [1]. To avoid creating multiple "reset-gpio" platform devices, store the Devicetree "reset-gpios" GPIO specifiers used for new devices on a linked list. Later such Devicetree GPIO specifier (phandle to GPIO controller, GPIO number and GPIO flags) is used to check if reset controller for given GPIO was already registered. If two devices have conflicting "reset-gpios" property, e.g. 
with different ACTIVE_xxx flags, this would allow to spawn two separate "reset-gpio" devices, where the second would fail probing on busy GPIO request. Link: https://lore.kernel.org/all/YXi5CUCEi7YmNxXM@robh.at.kernel.org/ [1] Cc: Bartosz Golaszewski Cc: Chris Packham Cc: Sean Anderson Reviewed-by: Philipp Zabel Signed-off-by: Krzysztof Kozlowski Acked-by: Bartosz Golaszewski Acked-by: Linus Walleij Link: https://lore.kernel.org/r/20240129115216.96479-5-krzysztof.kozlowski@linaro.org Signed-off-by: Philipp Zabel --- include/linux/reset-controller.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/reset-controller.h b/include/linux/reset-controller.h index 0fa4f60e1186..357df16ede32 100644 --- a/include/linux/reset-controller.h +++ b/include/linux/reset-controller.h @@ -60,6 +60,9 @@ struct reset_control_lookup { * @reset_control_head: head of internal list of requested reset controls * @dev: corresponding driver model device struct * @of_node: corresponding device tree node as phandle target + * @of_args: for reset-gpios controllers: corresponding phandle args with + * of_node and GPIO number complementing of_node; either this or + * of_node should be present * @of_reset_n_cells: number of cells in reset line specifiers * @of_xlate: translation function to translate from specifier as found in the * device tree to id as given to the reset control ops, defaults @@ -73,6 +76,7 @@ struct reset_controller_dev { struct list_head reset_control_head; struct device *dev; struct device_node *of_node; + const struct of_phandle_args *of_args; int of_reset_n_cells; int (*of_xlate)(struct reset_controller_dev *rcdev, const struct of_phandle_args *reset_spec); -- cgit v1.2.3 From 2ed08e4bc53298db3f87b528cd804cb0cce066a9 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 21 Feb 2024 14:08:59 +0800 Subject: clocksource: Scale the watchdog read retries automatically On a 8-socket server the TSC is wrongly marked as 'unstable' and 
disabled during boot time on about one out of 120 boot attempts: clocksource: timekeeping watchdog on CPU227: wd-tsc-wd excessive read-back delay of 153560ns vs. limit of 125000ns, wd-wd read-back delay only 11440ns, attempt 3, marking tsc unstable tsc: Marking TSC unstable due to clocksource watchdog TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'. sched_clock: Marking unstable (119294969739, 159204297)<-(125446229205, -5992055152) clocksource: Checking clocksource tsc synchronization from CPU 319 to CPUs 0,99,136,180,210,542,601,896. clocksource: Switched to clocksource hpet The reason is that for platform with a large number of CPUs, there are sporadic big or huge read latencies while reading the watchdog/clocksource during boot or when system is under stress work load, and the frequency and maximum value of the latency goes up with the number of online CPUs. The current code already has logic to detect and filter such high latency case by reading the watchdog twice and checking the two deltas. Due to the randomness of the latency, there is a low probability that the first delta (latency) is big, but the second delta is small and looks valid. The watchdog code retries the readouts by default twice, which is not necessarily sufficient for systems with a large number of CPUs. There is a command line parameter 'max_cswd_read_retries' which allows to increase the number of retries, but that's not user friendly as it needs to be tweaked per system. As the number of required retries is proportional to the number of online CPUs, this parameter can be calculated at runtime. Scale and enlarge the number of retries according to the number of online CPUs and remove the command line parameter completely. [ tglx: Massaged change log and comments ] Signed-off-by: Feng Tang Signed-off-by: Thomas Gleixner Tested-by: Jin Wang Tested-by: Paul E. McKenney Reviewed-by: Waiman Long Reviewed-by: Paul E. 
McKenney Link: https://lore.kernel.org/r/20240221060859.1027450-1-feng.tang@intel.com --- include/linux/clocksource.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 1d42d4b17327..0ad8b550bb4b 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -291,7 +291,19 @@ static inline void timer_probe(void) {} #define TIMER_ACPI_DECLARE(name, table_id, fn) \ ACPI_DECLARE_PROBE_ENTRY(timer, name, table_id, 0, NULL, 0, fn) -extern ulong max_cswd_read_retries; +static inline unsigned int clocksource_get_max_watchdog_retry(void) +{ + /* + * When system is in the boot phase or under heavy workload, there + * can be random big latencies during the clocksource/watchdog + * read, so allow retries to filter the noise latency. As the + * latency's frequency and maximum value goes up with the number of + * CPUs, scale the number of retries with the number of online + * CPUs. + */ + return (ilog2(num_online_cpus()) / 2) + 1; +} + void clocksource_verify_percpu(struct clocksource *cs); #endif /* _LINUX_CLOCKSOURCE_H */ -- cgit v1.2.3 From 3f801968889459ecae1eab524b039676e6eaa319 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 14 Feb 2024 14:41:02 +0100 Subject: netfilter: move nf_reinject into nfnetlink_queue modules No need to keep this in the core, move it to the nfnetlink_queue module. nf_reroute is moved too, there were no other callers. 
Signed-off-by: Florian Westphal --- include/linux/netfilter.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 80900d910992..ffb5e0297eb5 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -370,7 +370,6 @@ __sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook, u_int8_t protocol, unsigned short family); int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict, unsigned short family); -int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry); #include -- cgit v1.2.3 From 78b88ef392c1ccc189c74cf73c179cf59d23a258 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 19 Feb 2024 17:45:48 +0000 Subject: net: wan: framer: remove children from struct framer_ops kdoc Remove documentation of non-existent children field from the Kernel doc for struct framer_ops. Introduced by 82c944d05b1a ("net: wan: Add framer framework support") Signed-off-by: Simon Horman Acked-by: Herve Codina Signed-off-by: David S. 
Miller --- include/linux/framer/framer-provider.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/framer/framer-provider.h b/include/linux/framer/framer-provider.h index f6fd2dd92591..9724d4b44b9c 100644 --- a/include/linux/framer/framer-provider.h +++ b/include/linux/framer/framer-provider.h @@ -83,7 +83,6 @@ struct framer_ops { /** * struct framer_provider - represents the framer provider * @dev: framer provider device - * @children: can be used to override the default (dev->of_node) child node * @owner: the module owner having of_xlate * @list: to maintain a linked list of framer providers * @of_xlate: function pointer to obtain framer instance from framer pointer -- cgit v1.2.3 From ee975351cf0c2a11cdf97eae58265c126cb32850 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 19 Feb 2024 12:40:51 -0800 Subject: net: mdio: mdio-bcm-unimac: Manage clock around I/O accesses Up until now we have managed not to have the mdio-bcm-unimac manage its clock except during probe and suspend/resume. This works most of the time, except where it does not. With a fully modular build, we can get into a situation whereby the GENET driver is fully registered, and so is the mdio-bcm-unimac driver, however the Ethernet PHY driver is not yet, because it depends on a resource that is not yet available (e.g.: GPIO provider). In that state, the network device is not usable yet, and so to conserve power, the GENET driver will have turned off its "main" clock which feeds its MDIO controller. When the PHY driver finally probes however, we make an access to the PHY registers to e.g.: disable interrupts, and this causes a bus error within the MDIO controller space because the MDIO controller clock(s) are turned off. To remedy that, we manage the clock around all of the I/O accesses to the hardware which are done exclusively during read, write and clock divider configuration. 
This ensures that the register space is accessible, and this also ensures that there are not unnecessarily elevated reference counts keeping the clocks active when the network device is administratively turned off. It would be the case with the previous way of managing the clock. Reviewed-by: Jacob Keller Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/platform_data/mdio-bcm-unimac.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/mdio-bcm-unimac.h b/include/linux/platform_data/mdio-bcm-unimac.h index 8a5f9f0b2c52..724e1f57b81f 100644 --- a/include/linux/platform_data/mdio-bcm-unimac.h +++ b/include/linux/platform_data/mdio-bcm-unimac.h @@ -1,11 +1,14 @@ #ifndef __MDIO_BCM_UNIMAC_PDATA_H #define __MDIO_BCM_UNIMAC_PDATA_H +struct clk; + struct unimac_mdio_pdata { u32 phy_mask; int (*wait_func)(void *data); void *wait_func_data; const char *bus_name; + struct clk *clk; }; #define UNIMAC_MDIO_DRV_NAME "unimac-mdio" -- cgit v1.2.3 From f79ab5d2bced9bd7c0ce86d2aa5b70d053001bb4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 15 Feb 2024 17:41:36 +0200 Subject: wifi: cfg80211: Add KHZ_PER_GHZ to units.h and reuse The KHZ_PER_GHZ might be used by others (with the name aligned with similar constants). Define it in units.h and convert wireless to use it. 
Signed-off-by: Andy Shevchenko Reviewed-by: Simon Horman Link: https://msgid.link/20240215154136.630029-1-andriy.shevchenko@linux.intel.com Signed-off-by: Johannes Berg --- include/linux/units.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/units.h b/include/linux/units.h index 45110daaf8d3..00e15de33eca 100644 --- a/include/linux/units.h +++ b/include/linux/units.h @@ -24,10 +24,13 @@ #define NANOHZ_PER_HZ 1000000000UL #define MICROHZ_PER_HZ 1000000UL #define MILLIHZ_PER_HZ 1000UL + #define HZ_PER_KHZ 1000UL -#define KHZ_PER_MHZ 1000UL #define HZ_PER_MHZ 1000000UL +#define KHZ_PER_MHZ 1000UL +#define KHZ_PER_GHZ 1000000UL + #define MILLIWATT_PER_WATT 1000UL #define MICROWATT_PER_MILLIWATT 1000UL #define MICROWATT_PER_WATT 1000000UL -- cgit v1.2.3 From 6bd14aee0bd25525ab229acd9bfe536dd8642364 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 16 Feb 2024 13:54:31 +0200 Subject: wifi: mac80211: align ieee80211_mle_get_bss_param_ch_cnt() Align the prototype of ieee80211_mle_get_bss_param_ch_cnt() to also take a u8 * like the other functions, and make it return -1 when the field isn't found, so that mac80211 can check that instead of explicitly open-coding the check. 
Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://msgid.link/20240216135047.583309181bc3.Ia61cb0b4fc034d5ac8fcfaf6f6fb2e115fadafe7@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index e4322238f273..303c75459897 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4990,17 +4990,18 @@ static inline int ieee80211_mle_get_link_id(const u8 *data) /** * ieee80211_mle_get_bss_param_ch_cnt - returns the BSS parameter change count - * @mle: the basic multi link element + * @data: pointer to the basic multi link element * * The element is assumed to be of the correct type (BASIC) and big enough, * this must be checked using ieee80211_mle_type_ok(). * * If the BSS parameter change count value can't be found (the presence bit - * for it is clear), 0 will be returned. + * for it is clear), -1 will be returned. */ -static inline u8 -ieee80211_mle_get_bss_param_ch_cnt(const struct ieee80211_multi_link_elem *mle) +static inline int +ieee80211_mle_get_bss_param_ch_cnt(const u8 *data) { + const struct ieee80211_multi_link_elem *mle = (const void *)data; u16 control = le16_to_cpu(mle->control); const u8 *common = mle->variable; @@ -5008,7 +5009,7 @@ ieee80211_mle_get_bss_param_ch_cnt(const struct ieee80211_multi_link_elem *mle) common += sizeof(struct ieee80211_mle_basic_common_info); if (!(control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT)) - return 0; + return -1; if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) common += 1; -- cgit v1.2.3 From 894dd84e49ec114a2dde7b312ae4cada40d15bdb Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 16 Feb 2024 13:54:32 +0200 Subject: wifi: cfg80211: use ML element parsing helpers Use the existing ML element parsing helpers and add a new one for this (ieee80211_mle_get_mld_id). 
Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://msgid.link/20240216135047.4da47b1f035b.I437a5570ac456449facb0b147851ef24a1e473c2@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 303c75459897..3385a2cc5b09 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -5115,6 +5115,44 @@ static inline u16 ieee80211_mle_get_mld_capa_op(const u8 *data) return get_unaligned_le16(common); } +/** + * ieee80211_mle_get_mld_id - returns the MLD ID + * @data: pointer to the multi link element + * + * The element is assumed to be of the correct type (BASIC) and big enough, + * this must be checked using ieee80211_mle_type_ok(). + * + * If the MLD ID is not present, 0 will be returned. + */ +static inline u8 ieee80211_mle_get_mld_id(const u8 *data) +{ + const struct ieee80211_multi_link_elem *mle = (const void *)data; + u16 control = le16_to_cpu(mle->control); + const u8 *common = mle->variable; + + /* + * common points now at the beginning of + * ieee80211_mle_basic_common_info + */ + common += sizeof(struct ieee80211_mle_basic_common_info); + + if (!(control & IEEE80211_MLC_BASIC_PRES_MLD_ID)) + return 0; + + if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP) + common += 2; + + return *common; +} + /** * ieee80211_mle_size_ok - validate multi-link element size * @data: pointer to the element data -- cgit v1.2.3 From b820de741ae48ccf50dd95e297889c286ff4f760 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 15 Feb 2024 12:47:38 -0800 Subject: fs/aio: Restrict 
kiocb_set_cancel_fn() to I/O submitted via libaio If kiocb_set_cancel_fn() is called for I/O submitted via io_uring, the following kernel warning appears: WARNING: CPU: 3 PID: 368 at fs/aio.c:598 kiocb_set_cancel_fn+0x9c/0xa8 Call trace: kiocb_set_cancel_fn+0x9c/0xa8 ffs_epfile_read_iter+0x144/0x1d0 io_read+0x19c/0x498 io_issue_sqe+0x118/0x27c io_submit_sqes+0x25c/0x5fc __arm64_sys_io_uring_enter+0x104/0xab0 invoke_syscall+0x58/0x11c el0_svc_common+0xb4/0xf4 do_el0_svc+0x2c/0xb0 el0_svc+0x2c/0xa4 el0t_64_sync_handler+0x68/0xb4 el0t_64_sync+0x1a4/0x1a8 Fix this by setting the IOCB_AIO_RW flag for read and write I/O that is submitted by libaio. Suggested-by: Jens Axboe Cc: Christoph Hellwig Cc: Avi Kivity Cc: Sandeep Dhavale Cc: Jens Axboe Cc: Greg Kroah-Hartman Cc: Kent Overstreet Cc: stable@vger.kernel.org Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240215204739.2677806-2-bvanassche@acm.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a70495..c2dcc98cb4c8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -352,6 +352,8 @@ enum rw_hint { * unrelated IO (like cache flushing, new IO generation, etc). */ #define IOCB_DIO_CALLER_COMP (1 << 22) +/* kiocb is a read or write operation submitted by fs/aio.c. */ +#define IOCB_AIO_RW (1 << 23) /* for use in trace events */ #define TRACE_IOCB_STRINGS \ -- cgit v1.2.3 From a3c70a3cf11eb4b6409afc2cce1a3747e1dfe96f Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 20 Feb 2024 15:50:01 -0800 Subject: bpf: Shrink size of struct bpf_map/bpf_array. Back in 2018 the commit be95a845cc44 ("bpf: avoid false sharing of map refcount with max_entries") added ____cacheline_aligned to "struct bpf_map" to make sure that fields like refcnt don't share a cache line with max_entries that is used to bounds check map access. 
That was done to make spectre style attacks harder. The main mitigation is done via code similar to array_index_nospec(), of course. This was an additional precaution. It increased the size of "struct bpf_map" a little, but its effect on all other maps (like array) is significant, since "struct bpf_map" is typically the first member in other map types. Undo this ____cacheline_aligned tag. Instead move freeze_mutex field around, so that refcnt and max_entries are still in different cache lines. The main effect is seen in sizeof(struct bpf_array) that reduces from 320 to 248 bytes. BEFORE: struct bpf_map { const struct bpf_map_ops * ops; /* 0 8 */ ... char name[16]; /* 96 16 */ /* XXX 16 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ atomic64_t refcnt __attribute__((__aligned__(64))); /* 128 8 */ ... /* size: 256, cachelines: 4, members: 30 */ /* sum members: 232, holes: 1, sum holes: 16 */ /* padding: 8 */ /* paddings: 1, sum paddings: 2 */ } __attribute__((__aligned__(64))); struct bpf_array { struct bpf_map map; /* 0 256 */ ... 
/* size: 320, cachelines: 5, members: 5 */ /* padding: 48 */ /* paddings: 1, sum paddings: 8 */ } __attribute__((__aligned__(64))); AFTER: struct bpf_map { /* size: 232, cachelines: 4, members: 30 */ /* paddings: 1, sum paddings: 2 */ /* last cacheline: 40 bytes */ }; struct bpf_array { /* size: 248, cachelines: 4, members: 5 */ /* last cacheline: 56 bytes */ }; Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240220235001.57411-1-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c7aa99b44dbd..814dc913a968 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -251,10 +251,7 @@ struct bpf_list_node_kern { } __attribute__((aligned(8))); struct bpf_map { - /* The first two cachelines with read-mostly members of which some - * are also accessed in fast-path (e.g. ops, max_entries). - */ - const struct bpf_map_ops *ops ____cacheline_aligned; + const struct bpf_map_ops *ops; struct bpf_map *inner_map_meta; #ifdef CONFIG_SECURITY void *security; @@ -276,17 +273,14 @@ struct bpf_map { struct obj_cgroup *objcg; #endif char name[BPF_OBJ_NAME_LEN]; - /* The 3rd and 4th cacheline with misc members to avoid false sharing - * particularly with refcounting. 
- */ - atomic64_t refcnt ____cacheline_aligned; + struct mutex freeze_mutex; + atomic64_t refcnt; atomic64_t usercnt; /* rcu is used before freeing and work is only used during freeing */ union { struct work_struct work; struct rcu_head rcu; }; - struct mutex freeze_mutex; atomic64_t writecnt; /* 'Ownership' of program-containing map is claimed by the first program * that is going to use this map or by the first program which FD is -- cgit v1.2.3 From ac0c530619cefa68fba816dabbcf6f4ffbf60c3d Mon Sep 17 00:00:00 2001 From: Dimitri Fedrau Date: Sun, 18 Feb 2024 08:57:39 +0100 Subject: net: phy: Support 100/1000BT1 linkmode advertisements Extend helper functions mii_t1_adv_m_mod_linkmode_t and linkmode_adv_to_mii_t1_adv_m_t to support 100BT1 and 1000BT1 linkmode advertisements. Reviewed-by: Andrew Lunn Signed-off-by: Dimitri Fedrau Link: https://lore.kernel.org/r/20240218075753.18067-3-dima.fedrau@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/mdio.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index fd8ff310f9eb..68f8d2e970d4 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -373,6 +373,10 @@ static inline void mii_t1_adv_m_mod_linkmode_t(unsigned long *advertising, u32 l { linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT1L_Full_BIT, advertising, lpa & MDIO_AN_T1_ADV_M_B10L); + linkmode_mod_bit(ETHTOOL_LINK_MODE_100baseT1_Full_BIT, + advertising, lpa & MDIO_AN_T1_ADV_M_100BT1); + linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT1_Full_BIT, + advertising, lpa & MDIO_AN_T1_ADV_M_1000BT1); } /** @@ -409,6 +413,10 @@ static inline u32 linkmode_adv_to_mii_t1_adv_m_t(unsigned long *advertising) if (linkmode_test_bit(ETHTOOL_LINK_MODE_10baseT1L_Full_BIT, advertising)) result |= MDIO_AN_T1_ADV_M_B10L; + if (linkmode_test_bit(ETHTOOL_LINK_MODE_100baseT1_Full_BIT, advertising)) + result |= MDIO_AN_T1_ADV_M_100BT1; + if (linkmode_test_bit(ETHTOOL_LINK_MODE_1000baseT1_Full_BIT, 
advertising)) + result |= MDIO_AN_T1_ADV_M_1000BT1; return result; } -- cgit v1.2.3 From e57e4c7f6cc943be3346f938361334bb3634db3d Mon Sep 17 00:00:00 2001 From: Dimitri Fedrau Date: Sun, 18 Feb 2024 08:57:42 +0100 Subject: net: phy: marvell-88q2xxx: add driver for the Marvell 88Q2220 PHY Add a driver for the Marvell 88Q2220. This driver allows to detect the link, switch between 100BASE-T1 and 1000BASE-T1 and switch between master and slave mode. Autonegotiation is supported. Reviewed-by: Andrew Lunn Tested-by: Gregor Herburger Signed-off-by: Dimitri Fedrau Link: https://lore.kernel.org/r/20240218075753.18067-6-dima.fedrau@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/marvell_phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h index 9b54c4f0677f..693eba9869e4 100644 --- a/include/linux/marvell_phy.h +++ b/include/linux/marvell_phy.h @@ -26,6 +26,7 @@ #define MARVELL_PHY_ID_88E2110 0x002b09b0 #define MARVELL_PHY_ID_88X2222 0x01410f10 #define MARVELL_PHY_ID_88Q2110 0x002b0980 +#define MARVELL_PHY_ID_88Q2220 0x002b0b20 /* Marvel 88E1111 in Finisar SFP module with modified PHY ID */ #define MARVELL_PHY_ID_88E1111_FINISAR 0x01ff0cc0 -- cgit v1.2.3 From c5f1e2d1890935a734c302b9b8579748222b8e1e Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Mon, 8 Jan 2024 14:27:43 +0100 Subject: mm/memory_hotplug: introduce MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers Patch series "implement "memmap on memory" feature on s390". This series provides "memmap on memory" support on s390 platform. "memmap on memory" allows struct pages array to be allocated from the hotplugged memory range instead of allocating it from main system memory. s390 currently preallocates struct pages array for all potentially possible memory, which ensures memory onlining always succeeds, but with the cost of significant memory consumption from the available system memory during boottime. 
In certain extreme configurations, this could lead to ipl failure. "memmap on memory" ensures the struct pages array is populated from the self-contained hotplugged memory range instead of depleting the available system memory, and this could eliminate ipl failure on s390 platform. On other platforms, system might go OOM when the physically hotplugged memory depletes the available memory before it is onlined. Hence, "memmap on memory" feature was introduced as described in commit a08a2ae34613 ("mm,memory_hotplug: allocate memmap from the added memory range"). Unlike other architectures, s390 memory blocks are not physically accessible until they are online. To make them physically accessible, two new memory notifiers MEM_PREPARE_ONLINE / MEM_FINISH_OFFLINE are added, and these notifiers allow the hypervisor to be informed that the memory should be made physically accessible. This allows for "memmap on memory" initialization during memory hotplug onlining phase, which is performed before calling MEM_GOING_ONLINE notifier. Patch 1 introduces MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE memory notifiers to prepare the transition of memory to and from a physically accessible state. New mhp_flag MHP_OFFLINE_INACCESSIBLE is introduced to ensure altmap cannot be written when adding memory - before it is set online. This enhancement is crucial for implementing the "memmap on memory" feature for s390 in a subsequent patch. Patch 2 allocates vmemmap pages from self-contained memory range for s390. It allocates memory map (struct pages array) from the hotplugged memory range, rather than using system memory by passing altmap to vmemmap functions. Patch 3 removes unhandled memory notifier types on s390. Patch 4 implements MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE memory notifiers on s390. MEM_PREPARE_ONLINE memory notifier makes the memory block physically accessible via sclp assign command. The notifier ensures self-contained memory maps are accessible, hence enabling "memmap on memory" on s390. 
MEM_FINISH_OFFLINE memory notifier shifts the memory block to an inaccessible state via sclp unassign command. Patch 5 finally enables MHP_MEMMAP_ON_MEMORY on s390. This patch (of 5): Introduce MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE memory notifiers to prepare the transition of memory to and from a physically accessible state. This enhancement is crucial for implementing the "memmap on memory" feature for s390 in a subsequent patch. Platforms such as x86 can support physical memory hotplug via ACPI. When there is physical memory hotplug, ACPI event leads to the memory addition with the following callchain: acpi_memory_device_add() -> acpi_memory_enable_device() -> __add_memory() After this, the hotplugged memory is physically accessible, and altmap support prepared, before the "memmap on memory" initialization in memory_block_online() is called. On s390, memory hotplug works in a different way. The available hotplug memory has to be defined upfront in the hypervisor, but it is made physically accessible only when the user sets it online via sysfs, currently in the MEM_GOING_ONLINE notifier. This is too late and "memmap on memory" initialization is performed before calling MEM_GOING_ONLINE notifier. During the memory hotplug addition phase, altmap support is prepared and during the memory onlining phase s390 requires memory to be physically accessible and then subsequently initiate the "memmap on memory" initialization process. The memory provider will handle new MEM_PREPARE_ONLINE / MEM_FINISH_OFFLINE notifications and make the memory accessible. The mhp_flag MHP_OFFLINE_INACCESSIBLE is introduced and is relevant when used along with MHP_MEMMAP_ON_MEMORY, because the altmap cannot be written (e.g., poisoned) when adding memory -- before it is set online. This allows for adding memory with an altmap that is not currently made available by a hypervisor. 
When onlining that memory, the hypervisor can be instructed to make that memory accessible via the new notifiers and the onlining phase will not require any memory allocations, which is helpful in low-memory situations. All architectures ignore unknown memory notifiers. Therefore, the introduction of these new notifiers does not result in any functional modifications across architectures. Link: https://lkml.kernel.org/r/20240108132747.3238763-1-sumanthk@linux.ibm.com Link: https://lkml.kernel.org/r/20240108132747.3238763-2-sumanthk@linux.ibm.com Signed-off-by: Sumanth Korikkar Suggested-by: Gerald Schaefer Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Heiko Carstens Cc: Michal Hocko Cc: Oscar Salvador Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- include/linux/memory.h | 9 +++++++++ include/linux/memory_hotplug.h | 18 +++++++++++++++++- include/linux/memremap.h | 1 + 3 files changed, 27 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index f53cfdaaaa41..939a16bd5cea 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -96,8 +96,17 @@ int set_memory_block_size_order(unsigned int order); #define MEM_GOING_ONLINE (1<<3) #define MEM_CANCEL_ONLINE (1<<4) #define MEM_CANCEL_OFFLINE (1<<5) +#define MEM_PREPARE_ONLINE (1<<6) +#define MEM_FINISH_OFFLINE (1<<7) struct memory_notify { + /* + * The altmap_start_pfn and altmap_nr_pages fields are designated for + * specifying the altmap range and are exclusively intended for use in + * MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers. 
+ */ + unsigned long altmap_start_pfn; + unsigned long altmap_nr_pages; unsigned long start_pfn; unsigned long nr_pages; int status_change_nid_normal; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 7d2076583494..ee00015575aa 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -106,6 +106,22 @@ typedef int __bitwise mhp_t; * implies the node id (nid). */ #define MHP_NID_IS_MGID ((__force mhp_t)BIT(2)) +/* + * The hotplugged memory is completely inaccessible while the memory is + * offline. The memory provider will handle MEM_PREPARE_ONLINE / + * MEM_FINISH_OFFLINE notifications and make the memory accessible. + * + * This flag is only relevant when used along with MHP_MEMMAP_ON_MEMORY, + * because the altmap cannot be written (e.g., poisoned) when adding + * memory -- before it is set online. + * + * This allows for adding memory with an altmap that is not currently + * made available by a hypervisor. When onlining that memory, the + * hypervisor can be instructed to make that memory available, and + * the onlining phase will not require any memory allocations, which is + * helpful in low-memory situations. 
+ */ +#define MHP_OFFLINE_INACCESSIBLE ((__force mhp_t)BIT(3)) /* * Extended parameters for memory hotplug: @@ -154,7 +170,7 @@ extern void adjust_present_page_count(struct page *page, long nr_pages); /* VM interface that may be used by firmware interface */ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, - struct zone *zone); + struct zone *zone, bool mhp_off_inaccessible); extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages); extern int online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 744c830f4b13..9837f3e6fb95 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -25,6 +25,7 @@ struct vmem_altmap { unsigned long free; unsigned long align; unsigned long alloc; + bool inaccessible; }; /* -- cgit v1.2.3 From a02b8bfe9a60ecb97bab7ba1a10a513bf78a7866 Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Thu, 28 Dec 2023 06:27:15 +0000 Subject: mm: list_lru: remove unused macro list_lru_init_key() list_lru_init_key() isn't used by anyone, remove it to clean up. 
Link: https://lkml.kernel.org/r/20231228062715.338672-2-haifeng.xu@shopee.com Signed-off-by: Haifeng Xu Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 7675a48a0701..c679e6b293c4 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -62,8 +62,6 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, #define list_lru_init(lru) \ __list_lru_init((lru), false, NULL, NULL) -#define list_lru_init_key(lru, key) \ - __list_lru_init((lru), false, (key), NULL) #define list_lru_init_memcg(lru, shrinker) \ __list_lru_init((lru), true, NULL, shrinker) -- cgit v1.2.3 From 5662400a9ac03f38ef3b84e4ff9a640a4604bef9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 11 Jan 2024 15:24:20 +0000 Subject: mm: add pfn_swap_entry_folio() Patch series "mm: convert mm counter to take a folio", v3. Make sure all mm_counter() and mm_counter_file() callers have a folio, then convert mm counter functions to take a folio, which saves some compound_head() calls. 
This patch (of 10): Thanks to the compound_head() hidden inside PageLocked(), this saves a call to compound_head() over calling page_folio(pfn_swap_entry_to_page()) Link: https://lkml.kernel.org/r/20240111152429.3374566-1-willy@infradead.org Link: https://lkml.kernel.org/r/20240111152429.3374566-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Kefeng Wang Signed-off-by: Andrew Morton --- include/linux/swapops.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index bff1e8d97de0..48b700ba1d18 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -468,6 +468,19 @@ static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) return p; } +static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry) +{ + struct folio *folio = pfn_folio(swp_offset_pfn(entry)); + + /* + * Any use of migration entries may only occur while the + * corresponding folio is locked + */ + BUG_ON(is_migration_entry(entry) && !folio_test_locked(folio)); + + return folio; +} + /* * A pfn swap entry is a special type of swap entry that always has a pfn stored * in the swap offset. They are used to represent unaddressable device memory -- cgit v1.2.3 From a23f517b0e1554467b0eb3bc1ebcb4d626217302 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 11 Jan 2024 15:24:28 +0000 Subject: mm: convert mm_counter() to take a folio Now all callers of mm_counter() have a folio, convert mm_counter() to take a folio. Saves a call to compound_head() hidden inside PageAnon(). 
Link: https://lkml.kernel.org/r/20240111152429.3374566-10-willy@infradead.org Signed-off-by: Kefeng Wang Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f5a97dec5169..22e597b36b38 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2603,11 +2603,11 @@ static inline int mm_counter_file(struct page *page) return MM_FILEPAGES; } -static inline int mm_counter(struct page *page) +static inline int mm_counter(struct folio *folio) { - if (PageAnon(page)) + if (folio_test_anon(folio)) return MM_ANONPAGES; - return mm_counter_file(page); + return mm_counter_file(&folio->page); } static inline unsigned long get_mm_rss(struct mm_struct *mm) -- cgit v1.2.3 From 6b27cc6c66abf0f0b091a95ca1ad4e0fc68c11fd Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 11 Jan 2024 15:24:29 +0000 Subject: mm: convert mm_counter_file() to take a folio Now all callers of mm_counter_file() have a folio, convert mm_counter_file() to take a folio. Saves a call to compound_head() hidden inside PageSwapBacked(). 
Link: https://lkml.kernel.org/r/20240111152429.3374566-11-willy@infradead.org Signed-off-by: Kefeng Wang Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/mm.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 22e597b36b38..ac6b71cbdffb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2595,10 +2595,10 @@ static inline void dec_mm_counter(struct mm_struct *mm, int member) mm_trace_rss_stat(mm, member); } -/* Optimized variant when page is already known not to be PageAnon */ -static inline int mm_counter_file(struct page *page) +/* Optimized variant when folio is already known not to be anon */ +static inline int mm_counter_file(struct folio *folio) { - if (PageSwapBacked(page)) + if (folio_test_swapbacked(folio)) return MM_SHMEMPAGES; return MM_FILEPAGES; } @@ -2607,7 +2607,7 @@ static inline int mm_counter(struct folio *folio) { if (folio_test_anon(folio)) return MM_ANONPAGES; - return mm_counter_file(&folio->page); + return mm_counter_file(folio); } static inline unsigned long get_mm_rss(struct mm_struct *mm) -- cgit v1.2.3 From f796feabb9f5b1e5c48780a7a0023ab4b82336dd Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 20 Feb 2024 12:00:01 +0100 Subject: udp: add local "peek offset enabled" flag We want to re-organize the struct sock layout. The sk_peek_off field location is problematic, as most protocols want it in the RX read area, while UDP wants it on a cacheline different from sk_receive_queue. Create a local (inside udp_sock) copy of the 'peek offset is enabled' flag and place it inside the same cacheline of reader_queue. Check such flag before reading sk_peek_off. This will save potential false sharing and cache misses in the fast-path. Tested under UDP flood with small packets. 
The struct sock layout update causes a 4% performance drop, and this patch completely restores the original throughput. Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/67ab679c15fbf49fa05b3ffe05d91c47ab84f147.1708426665.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/udp.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index d04188714dca..3748e82b627b 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -92,6 +92,9 @@ struct udp_sock { /* This fields follows rcvbuf value, and is touched by udp_recvmsg */ int forward_threshold; + + /* Cache friendly copy of sk->sk_peek_off >= 0 */ + bool peeking_with_offset; }; #define udp_test_bit(nr, sk) \ @@ -109,6 +112,13 @@ struct udp_sock { #define udp_sk(ptr) container_of_const(ptr, struct udp_sock, inet.sk) +static inline int udp_set_peek_off(struct sock *sk, int val) +{ + sk_set_peek_off(sk, val); + WRITE_ONCE(udp_sk(sk)->peeking_with_offset, val >= 0); + return 0; +} + static inline void udp_set_no_check6_tx(struct sock *sk, bool val) { udp_assign_bit(NO_CHECK6_TX, sk, val); -- cgit v1.2.3 From 265b07df758a998f60cf5b5aec6bd72ca676655e Mon Sep 17 00:00:00 2001 From: Shradha Todi Date: Tue, 20 Feb 2024 14:10:45 +0530 Subject: clk: Provide managed helper to get and enable bulk clocks Provide a managed devm_clk_bulk* wrapper to get and enable all bulk clocks in order to simplify drivers that keep all clocks enabled for the time of driver operation. 
Suggested-by: Marek Szyprowski Reviewed-by: Alim Akhtar Reviewed-by: Manivannan Sadhasivam Signed-off-by: Shradha Todi Link: https://lore.kernel.org/r/20240220084046.23786-2-shradha.t@samsung.com Signed-off-by: Stephen Boyd --- include/linux/clk.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk.h b/include/linux/clk.h index 06f1b292f8a0..0f44d3863de2 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -478,6 +478,22 @@ int __must_check devm_clk_bulk_get_optional(struct device *dev, int num_clks, int __must_check devm_clk_bulk_get_all(struct device *dev, struct clk_bulk_data **clks); +/** + * devm_clk_bulk_get_all_enable - Get and enable all clocks of the consumer (managed) + * @dev: device for clock "consumer" + * @clks: pointer to the clk_bulk_data table of consumer + * + * Returns success (0) or negative errno. + * + * This helper function allows drivers to get all clocks of the + * consumer and enables them in one operation with management. + * The clks will automatically be disabled and freed when the device + * is unbound. + */ + +int __must_check devm_clk_bulk_get_all_enable(struct device *dev, + struct clk_bulk_data **clks); + /** * devm_clk_get - lookup and obtain a managed reference to a clock producer. 
* @dev: device for clock "consumer" @@ -968,6 +984,12 @@ static inline int __must_check devm_clk_bulk_get_all(struct device *dev, return 0; } +static inline int __must_check devm_clk_bulk_get_all_enable(struct device *dev, + struct clk_bulk_data **clks) +{ + return 0; +} + static inline struct clk *devm_get_clk_from_child(struct device *dev, struct device_node *np, const char *con_id) { -- cgit v1.2.3 From ff773fd2199960ffab0caae07451fe0f12b05bb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Lebrun?= Date: Wed, 21 Feb 2024 19:22:09 +0100 Subject: clk: fixed-factor: add optional accuracy support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixed factor clock reports the parent clock accuracy. Add flags and acc fields to `struct clk_fixed_factor` to support setting a fixed accuracy. The default if no flag is set is not changed: use the parent clock accuracy. Signed-off-by: Théo Lebrun Link: https://lore.kernel.org/r/20240221-mbly-clk-v7-1-31d4ce3630c3@bootlin.com Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 1293c38ddb7f..7ddc952c8c67 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -1084,18 +1084,28 @@ void of_fixed_factor_clk_setup(struct device_node *node); * @hw: handle between common and hardware-specific interfaces * @mult: multiplier * @div: divider + * @acc: fixed accuracy in ppb + * @flags: behavior modifying flags * * Clock with a fixed multiplier and divider. The output frequency is the * parent clock rate divided by div and multiplied by mult. - * Implements .recalc_rate, .set_rate and .round_rate + * Implements .recalc_rate, .set_rate, .round_rate and .recalc_accuracy + * + * Flags: + * * CLK_FIXED_FACTOR_FIXED_ACCURACY - Use the value in @acc instead of the + * parent clk accuracy. 
*/ struct clk_fixed_factor { struct clk_hw hw; unsigned int mult; unsigned int div; + unsigned long acc; + unsigned int flags; }; +#define CLK_FIXED_FACTOR_FIXED_ACCURACY BIT(0) + #define to_clk_fixed_factor(_hw) container_of(_hw, struct clk_fixed_factor, hw) extern const struct clk_ops clk_fixed_factor_ops; -- cgit v1.2.3 From ae156a3633d377d43990eb539f8a007c0c2bf769 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Lebrun?= Date: Wed, 21 Feb 2024 19:22:10 +0100 Subject: clk: fixed-factor: add fwname-based constructor functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add four functions to register clk_hw based on the fw_name field in clk_parent_data, ie the value in the DT property `clock-names`. There are variants for devm or not and passing an accuracy or not passing one: - clk_hw_register_fixed_factor_fwname - clk_hw_register_fixed_factor_with_accuracy_fwname - devm_clk_hw_register_fixed_factor_fwname - devm_clk_hw_register_fixed_factor_with_accuracy_fwname The `struct clk_parent_data` init is extracted from __clk_hw_register_fixed_factor to each calling function. It is required to allow each function to pass whatever field they want, not only index. 
Signed-off-by: Théo Lebrun Link: https://lore.kernel.org/r/20240221-mbly-clk-v7-2-31d4ce3630c3@bootlin.com Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 7ddc952c8c67..4a537260f655 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -1116,10 +1116,24 @@ void clk_unregister_fixed_factor(struct clk *clk); struct clk_hw *clk_hw_register_fixed_factor(struct device *dev, const char *name, const char *parent_name, unsigned long flags, unsigned int mult, unsigned int div); +struct clk_hw *clk_hw_register_fixed_factor_fwname(struct device *dev, + struct device_node *np, const char *name, const char *fw_name, + unsigned long flags, unsigned int mult, unsigned int div); +struct clk_hw *clk_hw_register_fixed_factor_with_accuracy_fwname(struct device *dev, + struct device_node *np, const char *name, const char *fw_name, + unsigned long flags, unsigned int mult, unsigned int div, + unsigned long acc); void clk_hw_unregister_fixed_factor(struct clk_hw *hw); struct clk_hw *devm_clk_hw_register_fixed_factor(struct device *dev, const char *name, const char *parent_name, unsigned long flags, unsigned int mult, unsigned int div); +struct clk_hw *devm_clk_hw_register_fixed_factor_fwname(struct device *dev, + struct device_node *np, const char *name, const char *fw_name, + unsigned long flags, unsigned int mult, unsigned int div); +struct clk_hw *devm_clk_hw_register_fixed_factor_with_accuracy_fwname(struct device *dev, + struct device_node *np, const char *name, const char *fw_name, + unsigned long flags, unsigned int mult, unsigned int div, + unsigned long acc); struct clk_hw *devm_clk_hw_register_fixed_factor_index(struct device *dev, const char *name, unsigned int index, unsigned long flags, unsigned int mult, unsigned int div); -- cgit v1.2.3 From 
e4ad2b0130eff1cc72f93ea7fd184b0e420f0736 Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Wed, 14 Feb 2024 18:30:02 +0000 Subject: firmware: arm_scmi: Add clock check for extended config support SCMI v3.2 added support to set/get clock custom OEM types; such support is conditionally present, though, depending on an extended config attribute bit possibly advertised by the platform server on a per-domain base. Add a check to verify if OEM types are supported before allowing any kind of OEM-specific get/set operation. Also add a check around all the new v3.2 clock features. Signed-off-by: Cristian Marussi Link: https://lore.kernel.org/r/20240214183006.3403207-4-cristian.marussi@arm.com Signed-off-by: Sudeep Holla --- include/linux/scmi_protocol.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 9b9351e07a11..46a61173c91c 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -50,6 +50,7 @@ struct scmi_clock_info { bool state_ctrl_forbidden; bool rate_ctrl_forbidden; bool parent_ctrl_forbidden; + bool extended_config; union { struct { int num_rates; -- cgit v1.2.3 From 62092c428fb528fcd117a580216915af04df450e Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Wed, 14 Feb 2024 18:30:03 +0000 Subject: firmware: arm_scmi: Add standard clock OEM definitions Add a common enum to define the standard clock OEM types defined by the SCMI specification, so as to enable the configuration of such extended configuration properties with the existent clock protocol operations. 
Signed-off-by: Cristian Marussi Link: https://lore.kernel.org/r/20240214183006.3403207-5-cristian.marussi@arm.com Signed-off-by: Sudeep Holla --- include/linux/scmi_protocol.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 46a61173c91c..2ee94ff0320c 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -76,6 +76,13 @@ struct scmi_handle; struct scmi_device; struct scmi_protocol_handle; +enum scmi_clock_oem_config { + SCMI_CLOCK_CFG_DUTY_CYCLE = 0x1, + SCMI_CLOCK_CFG_PHASE, + SCMI_CLOCK_CFG_OEM_START = 0x80, + SCMI_CLOCK_CFG_OEM_END = 0xFF, +}; + /** * struct scmi_clk_proto_ops - represents the various operations provided * by SCMI Clock Protocol @@ -108,10 +115,11 @@ struct scmi_clk_proto_ops { int (*state_get)(const struct scmi_protocol_handle *ph, u32 clk_id, bool *enabled, bool atomic); int (*config_oem_get)(const struct scmi_protocol_handle *ph, u32 clk_id, - u8 oem_type, u32 *oem_val, u32 *attributes, - bool atomic); + enum scmi_clock_oem_config oem_type, + u32 *oem_val, u32 *attributes, bool atomic); int (*config_oem_set)(const struct scmi_protocol_handle *ph, u32 clk_id, - u8 oem_type, u32 oem_val, bool atomic); + enum scmi_clock_oem_config oem_type, + u32 oem_val, bool atomic); int (*parent_get)(const struct scmi_protocol_handle *ph, u32 clk_id, u32 *parent_id); int (*parent_set)(const struct scmi_protocol_handle *ph, u32 clk_id, u32 parent_id); }; -- cgit v1.2.3 From 8e7e247f64a1e0fee430aba28d9108f7598eb237 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Wed, 21 Feb 2024 10:05:33 +0100 Subject: timers: Introduce add_timer() variants which modify timer flags A timer might be used as a pinned timer (using add_timer_on()) and later on as non-pinned timer using add_timer(). 
When the "NOHZ timer pull at expiry model" is in place, the TIMER_PINNED flag is required to be used whenever a timer needs to expire on a dedicated CPU. Otherwise the flag must not be set if expiration on a dedicated CPU is not required. add_timer_on()'s behavior will be changed during the preparation patches for the "NOHZ timer pull at expiry model" to unconditionally set the TIMER_PINNED flag. To be able to clear/ set the flag when queueing a timer, two variants of add_timer() are introduced. This is a preparatory step and has no functional change. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20240221090548.36600-6-anna-maria@linutronix.de --- include/linux/timer.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index f18a2f1eb79e..2be8be6dd317 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -165,6 +165,8 @@ extern int timer_reduce(struct timer_list *timer, unsigned long expires); #define NEXT_TIMER_MAX_DELTA ((1UL << 30) - 1) extern void add_timer(struct timer_list *timer); +extern void add_timer_local(struct timer_list *timer); +extern void add_timer_global(struct timer_list *timer); extern int try_to_del_timer_sync(struct timer_list *timer); extern int timer_delete_sync(struct timer_list *timer); -- cgit v1.2.3 From 7ee988770326fca440472200c3eb58935fe712f6 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Thu, 22 Feb 2024 11:37:10 +0100 Subject: timers: Implement the hierarchical pull model Placing timers at enqueue time on a target CPU based on dubious heuristics does not make any sense: 1) Most timer wheel timers are canceled or rearmed before they expire. 2) The heuristics to predict which CPU will be busy when the timer expires are wrong by definition. So placing the timers at enqueue wastes precious cycles. 
The proper solution to this problem is to always queue the timers on the local CPU and allow the non pinned timers to be pulled onto a busy CPU at expiry time. Therefore split the timer storage into local pinned and global timers: Local pinned timers are always expired on the CPU on which they have been queued. Global timers can be expired on any CPU. As long as a CPU is busy it expires both local and global timers. When a CPU goes idle it arms for the first expiring local timer. If the first expiring pinned (local) timer is before the first expiring movable timer, then no action is required because the CPU will wake up before the first movable timer expires. If the first expiring movable timer is before the first expiring pinned (local) timer, then this timer is queued into an idle timerqueue and eventually expired by another active CPU. To avoid global locking the timerqueues are implemented as a hierarchy. The lowest level of the hierarchy holds the CPUs. The CPUs are associated to groups of 8, which are separated per node. If more than one CPU group exist, then a second level in the hierarchy collects the groups. Depending on the size of the system more than 2 levels are required. Each group has a "migrator" which checks the timerqueue during the tick for remote expirable timers. If the last CPU in a group goes idle it reports the first expiring event in the group up to the next group(s) in the hierarchy. If the last CPU goes idle it arms its timer for the first system wide expiring timer to ensure that no timer event is missed. 
Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20240222103710.32582-1-anna-maria@linutronix.de --- include/linux/cpuhotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 172d0a743e5d..7651904c6db5 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -231,6 +231,7 @@ enum cpuhp_state { CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE, CPUHP_AP_PERF_POWERPC_HV_GPCI_ONLINE, CPUHP_AP_PERF_CSKY_ONLINE, + CPUHP_AP_TMIGR_ONLINE, CPUHP_AP_WATCHDOG_ONLINE, CPUHP_AP_WORKQUEUE_ONLINE, CPUHP_AP_RANDOM_ONLINE, -- cgit v1.2.3 From b2cf7507e18649a30512515ec0ca89f26b2c2d0f Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Wed, 21 Feb 2024 10:05:48 +0100 Subject: timers: Always queue timers on the local CPU The timer pull model is in place so we can remove the heuristics which try to guess the best target CPU at enqueue/modification time. All non pinned timers are queued on the local CPU in the separate storage and eventually pulled at expiry time to a remote CPU. Originally-by: Richard Cochran (linutronix GmbH) Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20240221090548.36600-21-anna-maria@linutronix.de --- include/linux/timer.h | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 2be8be6dd317..14a633ba61d6 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -36,16 +36,10 @@ * workqueue locking issues. It's not meant for executing random crap * with interrupts disabled. Abuse is monitored! * - * @TIMER_PINNED: A pinned timer will not be affected by any timer - * placement heuristics (like, NOHZ) and will always expire on the CPU - * on which the timer was enqueued. 
- * - * Note: Because enqueuing of timers can migrate the timer from one - * CPU to another, pinned timers are not guaranteed to stay on the - * initialy selected CPU. They move to the CPU on which the enqueue - * function is invoked via mod_timer() or add_timer(). If the timer - * should be placed on a particular CPU, then add_timer_on() has to be - * used. + * @TIMER_PINNED: A pinned timer will always expire on the CPU on which the + * timer was enqueued. When a particular CPU is required, add_timer_on() + * has to be used. Enqueue via mod_timer() and add_timer() is always done + * on the local CPU. */ #define TIMER_CPUMASK 0x0003FFFF #define TIMER_MIGRATING 0x00040000 -- cgit v1.2.3 From bb29fd7760ae39905127afd31fc83294625ff704 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 19 Jan 2024 11:22:22 +0000 Subject: mm/zswap: make sure each swapfile always have zswap rb-tree Patch series "mm/zswap: optimize the scalability of zswap rb-tree", v2. When testing the zswap performance by using kernel build -j32 in a tmpfs directory, I found the scalability of zswap rb-tree is not good, which is protected by the only spinlock. That would cause heavy lock contention if multiple tasks zswap_store/load concurrently. So a simple solution is to split the only one zswap rb-tree into multiple rb-trees, each corresponds to SWAP_ADDRESS_SPACE_PAGES (64M). This idea is from the commit 4b3ef9daa4fc ("mm/swap: split swap cache into 64MB trunks"). Although this method can't solve the spinlock contention completely, it can mitigate much of that contention. Below is the results of kernel build in tmpfs with zswap shrinker enabled: linux-next zswap-lock-optimize real 1m9.181s 1m3.820s user 17m44.036s 17m40.100s sys 7m37.297s 4m54.622s So there are clearly improvements. And it's complementary with the ongoing zswap xarray conversion by Chris. Anyway, I think we can also merge this first, it's complementary IMHO. So I just refresh and resend this for further discussion. 
This patch (of 2): Not all zswap interfaces can handle the absence of the zswap rb-tree; currently only zswap_store() handles it. To make things simple, we make sure each swapfile always has the zswap rb-tree prepared before being enabled and used. The preparation is unlikely to fail in practice; this patch just makes it explicit. Link: https://lkml.kernel.org/r/20240117-b4-zswap-lock-optimize-v2-0-b5cc55479090@bytedance.com Link: https://lkml.kernel.org/r/20240117-b4-zswap-lock-optimize-v2-1-b5cc55479090@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Nhat Pham Acked-by: Johannes Weiner Acked-by: Yosry Ahmed Cc: Chris Li Signed-off-by: Andrew Morton --- include/linux/zswap.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 0b709f5bc65f..eca388229d9a 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -30,7 +30,7 @@ struct zswap_lruvec_state { bool zswap_store(struct folio *folio); bool zswap_load(struct folio *folio); void zswap_invalidate(int type, pgoff_t offset); -void zswap_swapon(int type); +int zswap_swapon(int type); void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); void zswap_lruvec_state_init(struct lruvec *lruvec); @@ -51,7 +51,10 @@ static inline bool zswap_load(struct folio *folio) } static inline void zswap_invalidate(int type, pgoff_t offset) {} -static inline void zswap_swapon(int type) {} +static inline int zswap_swapon(int type) +{ + return 0; +} static inline void zswap_swapoff(int type) {} static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {} -- cgit v1.2.3 From 44c7c734a5132fc02f5584c7207f1d0c483f3ccd Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 19 Jan 2024 11:22:23 +0000 Subject: mm/zswap: split zswap rb-tree Each swapfile has one rb-tree to search the mapping of 
swp_entry_t to zswap_entry, which is protected by a spinlock; this can cause heavy lock contention if multiple tasks call zswap_store/load concurrently. Optimize the scalability problem by splitting the zswap rb-tree into multiple rb-trees, each corresponding to SWAP_ADDRESS_SPACE_PAGES (64M), just like we did in the swap cache address_space splitting. Although this method can't solve the spinlock contention completely, it can mitigate much of that contention. Below are the results of kernel build in tmpfs with zswap shrinker enabled: linux-next zswap-lock-optimize real 1m9.181s 1m3.820s user 17m44.036s 17m40.100s sys 7m37.297s 4m54.622s So there are clear improvements. Link: https://lkml.kernel.org/r/20240117-b4-zswap-lock-optimize-v2-2-b5cc55479090@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Johannes Weiner Acked-by: Nhat Pham Acked-by: Yosry Ahmed Cc: Chris Li Signed-off-by: Andrew Morton --- include/linux/zswap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/zswap.h b/include/linux/zswap.h index eca388229d9a..91895ce1fdbc 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -30,7 +30,7 @@ struct zswap_lruvec_state { bool zswap_store(struct folio *folio); bool zswap_load(struct folio *folio); void zswap_invalidate(int type, pgoff_t offset); -int zswap_swapon(int type); +int zswap_swapon(int type, unsigned long nr_pages); void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); void zswap_lruvec_state_init(struct lruvec *lruvec); @@ -51,7 +51,7 @@ static inline bool zswap_load(struct folio *folio) } static inline void zswap_invalidate(int type, pgoff_t offset) {} -static inline int zswap_swapon(int type) +static inline int zswap_swapon(int type, unsigned long nr_pages) { return 0; } -- cgit v1.2.3 From 42d9358252e5d055223487d9f653c2a2ac859a2a Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Wed, 24 Jan 2024 12:03:49 -0800 Subject: 
mm/memory_hotplug: export mhp_supports_memmap_on_memory() In preparation for adding sysfs ABI to toggle memmap_on_memory semantics for drivers adding memory, export the mhp_supports_memmap_on_memory() helper. This allows drivers to check if memmap_on_memory support is available before trying to request it, and display an appropriate message if it isn't available. As part of this, remove the size argument to this - with recent updates to allow memmap_on_memory for larger ranges, and the internal splitting of altmaps into respective memory blocks, the size argument is meaningless. [akpm@linux-foundation.org: fix build] Link: https://lkml.kernel.org/r/20240124-vv-dax_abi-v7-4-20d16cb8d23d@intel.com Signed-off-by: Vishal Verma Acked-by: David Hildenbrand Suggested-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Jonathan Cameron Cc: Li Zhijian Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Oscar Salvador Cc: Dan Williams Cc: Dave Jiang Cc: Dave Hansen Cc: Huang Ying Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index ee00015575aa..7a9ff464608d 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -137,6 +137,7 @@ struct mhp_params { bool mhp_range_allowed(u64 start, u64 size, bool need_mapping); struct range mhp_get_pluggable_range(bool need_mapping); +bool mhp_supports_memmap_on_memory(void); /* * Zone resizing functions @@ -278,6 +279,11 @@ static inline bool movable_node_is_enabled(void) return false; } +static inline bool mhp_supports_memmap_on_memory(void) +{ + return false; +} + static inline void pgdat_kswapd_lock(pg_data_t *pgdat) {} static inline void pgdat_kswapd_unlock(pg_data_t *pgdat) {} static inline void pgdat_kswapd_lock_init(pg_data_t *pgdat) {} -- cgit v1.2.3 From 9af47276ed83cc346263e56243756543a2a33c9d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox 
(Oracle)" Date: Wed, 24 Jan 2024 18:12:15 +0000 Subject: highmem: add kernel-doc for memcpy_*_folio() This was inadvertently skipped when adding the new functions. Link: https://lkml.kernel.org/r/20240124181217.1761674-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/highmem.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 451c1dff0e87..00341b56d291 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -439,6 +439,13 @@ static inline void memzero_page(struct page *page, size_t offset, size_t len) kunmap_local(addr); } +/** + * memcpy_from_folio - Copy a range of bytes from a folio. + * @to: The memory to copy to. + * @folio: The folio to read from. + * @offset: The first byte in the folio to read. + * @len: The number of bytes to copy. + */ static inline void memcpy_from_folio(char *to, struct folio *folio, size_t offset, size_t len) { @@ -460,6 +467,13 @@ static inline void memcpy_from_folio(char *to, struct folio *folio, } while (len > 0); } +/** + * memcpy_to_folio - Copy a range of bytes to a folio. + * @folio: The folio to write to. + * @offset: The first byte in the folio to store to. + * @from: The memory to copy from. + * @len: The number of bytes to copy. + */ static inline void memcpy_to_folio(struct folio *folio, size_t offset, const char *from, size_t len) { -- cgit v1.2.3 From 5cec4eb7fad6fb1e9a3dd8403b558d1eff7490ff Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 26 Jan 2024 16:19:44 +0800 Subject: mm and cache_info: remove unnecessary CPU cache info update For each CPU hotplug event, we will update per-CPU data slice size and corresponding PCP configuration for every online CPU to make the implementation simple. But, Kyle reported that this takes tens of seconds during boot on a machine with 34 zones and 3840 CPUs. 
So, in this patch, for each CPU hotplug event, we only update per-CPU data slice size and corresponding PCP configuration for the CPUs that share caches with the hotplugged CPU. With the patch, the system boot time is reduced by 67 seconds on the machine. Link: https://lkml.kernel.org/r/20240126081944.414520-1-ying.huang@intel.com Fixes: 362d37a106dd ("mm, pcp: reduce lock contention for draining high-order pages") Signed-off-by: "Huang, Ying" Originally-by: Kyle Meyer Reported-and-tested-by: Kyle Meyer Cc: Sudeep Holla Cc: Mel Gorman Signed-off-by: Andrew Morton --- include/linux/gfp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index de292a007138..09e22091f1b0 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -334,7 +334,7 @@ void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); void page_alloc_init_late(void); -void setup_pcp_cacheinfo(void); +void setup_pcp_cacheinfo(unsigned int cpu); /* * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what -- cgit v1.2.3 From 7dbbc8f57d4ba3c369b692db9fd2c9653abf0bb5 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 26 Jan 2024 08:06:43 +0000 Subject: x86/mm: delete unused cpu argument to leave_mm() The argument has been unused since commit 3d28ebceaffa ("x86/mm: Rework lazy TLB to track the actual loaded mm"); delete it. 
Link: https://lkml.kernel.org/r/20240126080644.1714297-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Cc: Andy Lutomirski Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/mmu_context.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h index f2b7a3f04099..bbaec80c78c5 100644 --- a/include/linux/mmu_context.h +++ b/include/linux/mmu_context.h @@ -11,7 +11,7 @@ #endif #ifndef leave_mm -static inline void leave_mm(int cpu) { } +static inline void leave_mm(void) { } #endif /* -- cgit v1.2.3 From 3f798aa6121ab3eb572f96ab2d8558894d979a4c Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 28 Jan 2024 13:28:51 +0000 Subject: mm/list_lru: remove list_lru_putback() Since the only user zswap_lru_putback() has gone, remove list_lru_putback() too. Link: https://lkml.kernel.org/r/20240126-zswap-writeback-race-v2-3-b10479847099@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Yosry Ahmed Cc: Chris Li Cc: Johannes Weiner Cc: Nhat Pham Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index c679e6b293c4..f2882a820690 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -168,22 +168,6 @@ static inline unsigned long list_lru_count(struct list_lru *lru) void list_lru_isolate(struct list_lru_one *list, struct list_head *item); void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, struct list_head *head); -/** - * list_lru_putback: undo list_lru_isolate - * @lru: the lru pointer. - * @item: the item to put back. - * @nid: the node id of the sublist to put the item back to. - * @memcg: the cgroup of the sublist to put the item back to. 
- * - * Put back an isolated item into its original LRU. Note that unlike - * list_lru_add, this does not increment the node LRU count (as - * list_lru_isolate does not originally decrement this count). - * - * Since we might have dropped the LRU lock in between, recompute list_lru_one - * from the node's id and memcg. - */ -void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid, - struct mem_cgroup *memcg); typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, struct list_lru_one *list, spinlock_t *lock, void *cb_arg); -- cgit v1.2.3 From fa3bea4e1f8202d787709b7e3654eb0a99aed758 Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Fri, 2 Feb 2024 12:02:37 -0500 Subject: mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be a wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA node's bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple NUMA nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. 
An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev). When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actual application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. 
An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf Signed-off-by: Gregory Price Co-developed-by: Rakie Kim Signed-off-by: Rakie Kim Co-developed-by: Honggyu Kim Signed-off-by: Honggyu Kim Co-developed-by: Hyeongtak Ji Signed-off-by: Hyeongtak Ji Co-developed-by: Srinivasulu Thanneeru Signed-off-by: Srinivasulu Thanneeru Co-developed-by: Ravi Jonnalagadda Signed-off-by: Ravi Jonnalagadda Reviewed-by: "Huang, Ying" Cc: Dan Williams Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index ffe8f618ab86..b9ce285d8c9c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1259,6 +1259,7 @@ struct task_struct { /* Protected by alloc_lock: */ struct mempolicy *mempolicy; short il_prev; + u8 il_weight; short pref_node_fork; #endif #ifdef CONFIG_NUMA_BALANCING -- cgit v1.2.3 From a5e8131a0329673f70faee2e9ffb02e8a5bb3c89 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Jan 2024 11:34:33 +0100 Subject: arm64, powerpc, riscv, s390, x86: ptdump: refactor CONFIG_DEBUG_WX All architectures using the core ptdump functionality also implement CONFIG_DEBUG_WX, and they all do it more or less the same way, with a function called debug_checkwx() that is called by mark_rodata_ro(), which is a substitute to ptdump_check_wx() when CONFIG_DEBUG_WX is set and a no-op otherwise. 
Refactor by centrally defining debug_checkwx() in linux/ptdump.h and call debug_checkwx() immediately after calling mark_rodata_ro() instead of calling it at the end of every mark_rodata_ro(). On x86_32, mark_rodata_ro() first checks __supported_pte_mask has _PAGE_NX before calling debug_checkwx(). Now the check is inside the callee ptdump_walk_pgd_level_checkwx(). On powerpc_64, mark_rodata_ro() bails out early before calling ptdump_check_wx() when the MMU doesn't have KERNEL_RO feature. The check is now also done in ptdump_check_wx() as it is called outside mark_rodata_ro(). Link: https://lkml.kernel.org/r/a59b102d7964261d31ead0316a9f18628e4e7a8e.1706610398.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Reviewed-by: Alexandre Ghiti Cc: Albert Ou Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: "Aneesh Kumar K.V (IBM)" Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: Gerald Schaefer Cc: Greg KH Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: Mark Rutland Cc: Michael Ellerman Cc: "Naveen N. 
Rao" Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Phong Tran Cc: Russell King Cc: Steven Price Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/ptdump.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h index 2a3a95586425..c10513739bf9 100644 --- a/include/linux/ptdump.h +++ b/include/linux/ptdump.h @@ -19,5 +19,12 @@ struct ptdump_state { }; void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd); +void ptdump_check_wx(void); + +static inline void debug_checkwx(void) +{ + if (IS_ENABLED(CONFIG_DEBUG_WX)) + ptdump_check_wx(); +} #endif /* _LINUX_PTDUMP_H */ -- cgit v1.2.3 From 6cdc82db0c044d36137dd98f33e8aa0b8742987f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Jan 2024 11:34:35 +0100 Subject: mm: ptdump: have ptdump_check_wx() return bool Have ptdump_check_wx() return true when the check is successful or false otherwise. [akpm@linux-foundation.org: fix a couple of build issues (x86_64 allmodconfig)] Link: https://lkml.kernel.org/r/7943149fe955458cb7b57cd483bf41a3aad94684.1706610398.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: "Aneesh Kumar K.V (IBM)" Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: Gerald Schaefer Cc: Greg KH Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: Mark Rutland Cc: Michael Ellerman Cc: "Naveen N. 
Rao" Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Phong Tran Cc: Russell King Cc: Steven Price Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/ptdump.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h index c10513739bf9..8dbd51ea8626 100644 --- a/include/linux/ptdump.h +++ b/include/linux/ptdump.h @@ -18,8 +18,11 @@ struct ptdump_state { const struct ptdump_range *range; }; +bool ptdump_walk_pgd_level_core(struct seq_file *m, + struct mm_struct *mm, pgd_t *pgd, + bool checkwx, bool dmesg); void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd); -void ptdump_check_wx(void); +bool ptdump_check_wx(void); static inline void debug_checkwx(void) { -- cgit v1.2.3 From 6cdfa1d5d5d8285108495c33588c48cdda81b647 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:42 +0100 Subject: mm/pgtable: make pte_next_pfn() independent of set_ptes() Let's provide pte_next_pfn(), independently of set_ptes(). This allows for using the generic pte_next_pfn() version in some arch-specific set_ptes() implementations, and prepares for reusing pte_next_pfn() in other context. Link: https://lkml.kernel.org/r/20240129124649.189745-9-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Christophe Leroy Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. 
Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index f6d0e3513948..351cd9dc7194 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -212,7 +212,6 @@ static inline int pmd_dirty(pmd_t pmd) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif -#ifndef set_ptes #ifndef pte_next_pfn static inline pte_t pte_next_pfn(pte_t pte) @@ -221,6 +220,7 @@ static inline pte_t pte_next_pfn(pte_t pte) } #endif +#ifndef set_ptes /** * set_ptes - Map consecutive pages to a contiguous range of addresses. * @mm: Address space to map the pages into. -- cgit v1.2.3 From f8d937761d65c87e9987b88ea7beb7bddc333a0e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:47 +0100 Subject: mm/memory: optimize fork() with PTE-mapped THP Let's implement PTE batching when consecutive (present) PTEs map consecutive pages of the same large folio, and all other PTE bits besides the PFNs are equal. We will optimize folio_pte_batch() separately, to ignore selected PTE bits. This patch is based on work by Ryan Roberts. Use __always_inline for __copy_present_ptes() and keep the handling for single PTEs completely separate from the multi-PTE case: we really want the compiler to optimize for the single-PTE case with small folios, to not degrade performance. Note that PTE batching will never exceed a single page table and will always stay within VMA boundaries. Further, processing PTE-mapped THP that maybe pinned and have PageAnonExclusive set on at least one subpage should work as expected, but there is room for improvement: We will repeatedly (1) detect a PTE batch (2) detect that we have to copy a page (3) fall back and allocate a single page to copy a single page. 
For now we won't care as pinned pages are a corner case, and we should rather look into maintaining only a single PageAnonExclusive bit for large folios. Link: https://lkml.kernel.org/r/20240129124649.189745-14-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 351cd9dc7194..aab227e12493 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -650,6 +650,37 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres } #endif +#ifndef wrprotect_ptes +/** + * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same + * folio. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to write-protect. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over ptep_set_wrprotect(). + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. 
+ */ +static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + for (;;) { + ptep_set_wrprotect(mm, addr, ptep); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } +} +#endif + /* * On some architectures hardware does not set page access bit when accessing * memory page, it is responsibility of software setting this bit. It brings -- cgit v1.2.3 From 09dacb7875395b8761cde921fff767a7cd3ab862 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 2 Feb 2024 22:23:18 +0100 Subject: mm: reduce dependencies on "page_counter.h" does not need . is enough to get LONG_MAX. Files that include page_counter.h are limited. They have been compile tested or checked. $ git grep page_counter\.h include/linux/hugetlb_cgroup.h: struct page_counter hugepage[HUGE_MAX_HSTATE]; --> all files that include it have been compile tested include/linux/memcontrol.h:#include --> has been added, to be safe include/net/sock.h:#include --> already include mm/hugetlb_cgroup.c:#include mm/memcontrol.c:#include mm/page_counter.c:#include --> compile tested Link: https://lkml.kernel.org/r/adfdbe21c4d06400d7bd802868762deb85cae8b6.1706908921.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 1 + include/linux/page_counter.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 20ff87f8e001..4e4caeaea404 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index c141ea9a95ef..8cd858d912c4 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -4,7 +4,7 @@ #include #include -#include +#include #include struct page_counter { -- cgit v1.2.3 From 
73307523c9bbc4e3b35f0058cdbc15e32bd83c52 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 5 Feb 2024 10:49:29 +0530 Subject: mm/cma: make MAX_CMA_AREAS = CONFIG_CMA_AREAS There is no real difference between the global area, and other additionally configured CMA areas via CONFIG_CMA_AREAS that always defaults without user input. This makes MAX_CMA_AREAS same as CONFIG_CMA_AREAS, also incrementing its default values, thus maintaining current default for MAX_CMA_AREAS both for UMA and NUMA systems. Link: https://lkml.kernel.org/r/20240205051929.298559-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Signed-off-by: Andrew Morton --- include/linux/cma.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cma.h b/include/linux/cma.h index 63873b93deaa..9db877506ea8 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -6,12 +6,8 @@ #include #include -/* - * There is always at least global CMA area and a few optional - * areas configured in kernel .config. - */ #ifdef CONFIG_CMA_AREAS -#define MAX_CMA_AREAS (1 + CONFIG_CMA_AREAS) +#define MAX_CMA_AREAS CONFIG_CMA_AREAS #endif #define CMA_MAX_NAME 64 -- cgit v1.2.3 From 0827a1fb143fae588cb6f5b9a97c405d6c2ddec9 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 4 Feb 2024 03:06:00 +0000 Subject: mm/zswap: invalidate zswap entry when swap entry free During testing I found that zswap_writeback_entry() sometimes returns -ENOMEM, which is not what we expected: bpftrace -e 'kr:zswap_writeback_entry {@[(int32)retval]=count()}' @[-12]: 1563 @[0]: 277221 The reason is that __read_swap_cache_async() returns NULL because swapcache_prepare() failed. The reason is that we don't invalidate the zswap entry when the swap entry is freed to the per-cpu pool, so these zswap entries are still on the zswap tree and lru list. 
This patch moves the invalidation ahead to when the swap entry is freed to the per-cpu pool, since there is no benefit in leaving stale zswap entries on the tree and lru list. With this patch: bpftrace -e 'kr:zswap_writeback_entry {@[(int32)retval]=count()}' @[0]: 259744 Note: a large folio can't have a zswap entry for now, so don't bother to add zswap entry invalidation in the large folio swap free path. Link: https://lkml.kernel.org/r/20240201-b4-zswap-invalidate-entry-v2-2-99d4084260a0@bytedance.com Signed-off-by: Chengming Zhou Reviewed-by: Nhat Pham Acked-by: Johannes Weiner Acked-by: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/zswap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 91895ce1fdbc..341aea490070 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -29,7 +29,7 @@ struct zswap_lruvec_state { bool zswap_store(struct folio *folio); bool zswap_load(struct folio *folio); -void zswap_invalidate(int type, pgoff_t offset); +void zswap_invalidate(swp_entry_t swp); int zswap_swapon(int type, unsigned long nr_pages); void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); @@ -50,7 +50,7 @@ static inline bool zswap_load(struct folio *folio) return false; } -static inline void zswap_invalidate(int type, pgoff_t offset) {} +static inline void zswap_invalidate(swp_entry_t swp) {} static inline int zswap_swapon(int type, unsigned long nr_pages) { return 0; -- cgit v1.2.3 From b49547ade38a63ff39c9fbc53fb38622cb63854a Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sun, 4 Feb 2024 03:06:01 +0000 Subject: mm/zswap: stop lru list shrinking when encounter warm region When the shrinker encounters an existing folio in the swap cache, it means we are shrinking into the warmer region. We should terminate shrinking if we're in the dynamic shrinker context. This patch adds LRU_STOP to support this, to avoid overshrinking. 
Link: https://lkml.kernel.org/r/20240201-b4-zswap-invalidate-entry-v2-3-99d4084260a0@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Johannes Weiner Acked-by: Nhat Pham Reviewed-by: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index f2882a820690..792b67ceb631 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -24,6 +24,8 @@ enum lru_status { LRU_SKIP, /* item cannot be locked, skip */ LRU_RETRY, /* item not freeable. May drop the lock internally, but has to return locked. */ + LRU_STOP, /* stop lru list walking. May drop the lock + internally, but has to return locked. */ }; struct list_lru_one { -- cgit v1.2.3 From 0c32c9f7a58e7736b27f9d6766e9f21d34a26eff Mon Sep 17 00:00:00 2001 From: John Groves Date: Mon, 5 Feb 2024 18:57:37 -0600 Subject: memremap.h: correct an error in a comment It tried to send me off to memory_hotplug.h for an enum that is a few lines above... 
Link: https://lkml.kernel.org/r/dba0f5f01162d6fa16e4da2a9fede7f97080e92d.1707179960.git.john@groves.net Signed-off-by: John Groves Reviewed-by: Dan Williams Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/memremap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 9837f3e6fb95..3f7143ade32c 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -109,7 +109,7 @@ struct dev_pagemap_ops { * @altmap: pre-allocated/reserved memory for vmemmap allocations * @ref: reference count that pins the devm_memremap_pages() mapping * @done: completion for @ref - * @type: memory type: see MEMORY_* in memory_hotplug.h + * @type: memory type: see MEMORY_* above in memremap.h * @flags: PGMAP_* flags to specify defailed behavior * @vmemmap_shift: structural definition of how the vmemmap page metadata * is populated, specifically the metadata page order. -- cgit v1.2.3 From cfb837e8433179995709578ca5b4741adcd54ec7 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 12 Feb 2024 19:29:51 +0100 Subject: mm: document memalloc_noreclaim_save() and memalloc_pin_save() The memalloc_noreclaim_save() function currently has no documentation comment, so the implications of its usage are not obvious. Namely that it not only prevents entering reclaim (as the name suggests), but also allows using all memory reserves and thus should be only used in contexts that are allocating memory to free memory. This may lead to new improper usages being added. Thus add a documenting comment, based on the description of __GFP_MEMALLOC. While at it, also document memalloc_pin_save() so that all the memalloc_ scopes are documented. For those already documented, add missing Return: descriptions, and mark Context: description per kernel-docs style guide. In the comments describing the relevant PF_MEMALLOC flags, refer to their scope setting functions. 
[vbabka@suse.cz: fix issues that Mike pointed out] Link: https://lkml.kernel.org/r/20240215095827.13756-2-vbabka@suse.cz Link: https://lkml.kernel.org/r/20240212182950.32730-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Mike Rapoport (IBM) Acked-by: Michal Hocko Cc: Kent Overstreet Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- include/linux/sched.h | 9 ++++---- include/linux/sched/mm.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 58 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index b9ce285d8c9c..998861865b84 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1624,15 +1624,15 @@ extern struct pid *cad_pid; #define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* Dumped core */ #define PF_SIGNALED 0x00000400 /* Killed by a signal */ -#define PF_MEMALLOC 0x00000800 /* Allocating memory */ +#define PF_MEMALLOC 0x00000800 /* Allocating memory to free memory. See memalloc_noreclaim_save() */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ #define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF__HOLE__00010000 0x00010000 #define PF_KSWAPD 0x00020000 /* I am kswapd */ -#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ -#define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ +#define PF_MEMALLOC_NOFS 0x00040000 /* All allocations inherit GFP_NOFS. See memalloc_nofs_save() */ +#define PF_MEMALLOC_NOIO 0x00080000 /* All allocations inherit GFP_NOIO. 
See memalloc_noio_save() */ #define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to, * I am cleaning dirty pages from some other bdi. */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ @@ -1642,7 +1642,8 @@ extern struct pid *cad_pid; #define PF__HOLE__02000000 0x02000000 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ -#define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ +#define PF_MEMALLOC_PIN 0x10000000 /* Allocations constrained to zones which allow long term pinning. + * See memalloc_pin_save() */ #define PF__HOLE__20000000 0x20000000 #define PF__HOLE__40000000 0x40000000 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 9a19f1b42f64..7a4066d22883 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -315,7 +315,8 @@ static inline void might_alloc(gfp_t gfp_mask) * point of view. Use memalloc_noio_restore to end the scope with flags * returned by this function. * - * This function is safe to be used from any context. + * Context: This function is safe to be used from any context. + * Return: The saved flags to be passed to memalloc_noio_restore. */ static inline unsigned int memalloc_noio_save(void) { @@ -346,7 +347,8 @@ static inline void memalloc_noio_restore(unsigned int flags) * point of view. Use memalloc_nofs_restore to end the scope with flags * returned by this function. * - * This function is safe to be used from any context. + * Context: This function is safe to be used from any context. + * Return: The saved flags to be passed to memalloc_nofs_restore. 
*/ static inline unsigned int memalloc_nofs_save(void) { @@ -368,6 +370,29 @@ static inline void memalloc_nofs_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags; } +/** + * memalloc_noreclaim_save - Marks implicit __GFP_MEMALLOC scope. + * + * This function marks the beginning of the __GFP_MEMALLOC allocation scope. + * All further allocations will implicitly add the __GFP_MEMALLOC flag, which + * prevents entering reclaim and allows access to all memory reserves. This + * should only be used when the caller guarantees the allocation will allow more + * memory to be freed very shortly, i.e. it needs to allocate some memory in + * the process of freeing memory, and cannot reclaim due to potential recursion. + * + * Users of this scope have to be extremely careful to not deplete the reserves + * completely and implement a throttling mechanism which controls the + * consumption of the reserve based on the amount of freed memory. Usage of a + * pre-allocated pool (e.g. mempool) should be always considered before using + * this scope. + * + * Individual allocations under the scope can opt out using __GFP_NOMEMALLOC + * + * Context: This function should not be used in an interrupt context as that one + * does not give PF_MEMALLOC access to reserves. + * See __gfp_pfmemalloc_flags(). + * Return: The saved flags to be passed to memalloc_noreclaim_restore. + */ static inline unsigned int memalloc_noreclaim_save(void) { unsigned int flags = current->flags & PF_MEMALLOC; @@ -375,11 +400,29 @@ static inline unsigned int memalloc_noreclaim_save(void) return flags; } +/** + * memalloc_noreclaim_restore - Ends the implicit __GFP_MEMALLOC scope. + * @flags: Flags to restore. + * + * Ends the implicit __GFP_MEMALLOC scope started by memalloc_noreclaim_save + * function. Always make sure that the given flags is the return value from the + * pairing memalloc_noreclaim_save call. 
+ */ static inline void memalloc_noreclaim_restore(unsigned int flags) { current->flags = (current->flags & ~PF_MEMALLOC) | flags; } +/** + * memalloc_pin_save - Marks implicit ~__GFP_MOVABLE scope. + * + * This function marks the beginning of the ~__GFP_MOVABLE allocation scope. + * All further allocations will implicitly remove the __GFP_MOVABLE flag, which + * will constraint the allocations to zones that allow long term pinning, i.e. + * not ZONE_MOVABLE zones. + * + * Return: The saved flags to be passed to memalloc_pin_restore. + */ static inline unsigned int memalloc_pin_save(void) { unsigned int flags = current->flags & PF_MEMALLOC_PIN; @@ -388,6 +431,14 @@ static inline unsigned int memalloc_pin_save(void) return flags; } +/** + * memalloc_pin_restore - Ends the implicit ~__GFP_MOVABLE scope. + * @flags: Flags to restore. + * + * Ends the implicit ~__GFP_MOVABLE scope started by memalloc_pin_save function. + * Always make sure that the given flags is the return value from the pairing + * memalloc_pin_save call. + */ static inline void memalloc_pin_restore(unsigned int flags) { current->flags = (current->flags & ~PF_MEMALLOC_PIN) | flags; -- cgit v1.2.3 From cc25bbe10a86a7fe3ec8468fb01e579e6216d7e1 Mon Sep 17 00:00:00 2001 From: Kinsey Ho Date: Wed, 14 Feb 2024 06:05:37 +0000 Subject: mm/mglru: improve struct lru_gen_mm_walk Rename max_seq to seq in struct lru_gen_mm_walk to keep consistent with struct lru_gen_mm_state. Note that seq is not always up to date with max_seq from lru_gen_folio. No functional changes. 
Link: https://lkml.kernel.org/r/20240214060538.3524462-5-kinseyho@google.com Signed-off-by: Kinsey Ho Cc: Aneesh Kumar K.V Cc: Donet Tom Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a497f189d988..633812a1d220 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -464,7 +464,7 @@ enum { #define NR_BLOOM_FILTERS 2 struct lru_gen_mm_state { - /* set to max_seq after each iteration */ + /* synced with max_seq after each iteration */ unsigned long seq; /* where the current iteration continues after */ struct list_head *head; @@ -479,8 +479,8 @@ struct lru_gen_mm_state { struct lru_gen_mm_walk { /* the lruvec under reclaim */ struct lruvec *lruvec; - /* unstable max_seq from lru_gen_folio */ - unsigned long max_seq; + /* max_seq from lru_gen_folio: can be out of date */ + unsigned long seq; /* the next address within an mm to scan */ unsigned long next_addr; /* to batch promoted pages */ -- cgit v1.2.3 From 1cbcb564f5b67cee2fc2f78132b9733118a79c6d Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 5 Feb 2024 14:48:24 +0200 Subject: net/mlx5: Add the IFC related bits for query tracker Add the IFC related bits for query tracker. 
Signed-off-by: Yishai Hadas Reviewed-by: Kevin Tian Acked-by: Leon Romanovsky Link: https://lore.kernel.org/r/20240205124828.232701-2-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/linux/mlx5/mlx5_ifc.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index c726f90ab752..0e513e372bf0 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -12672,6 +12672,11 @@ struct mlx5_ifc_modify_page_track_obj_in_bits { struct mlx5_ifc_page_track_bits obj_context; }; +struct mlx5_ifc_query_page_track_obj_out_bits { + struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr; + struct mlx5_ifc_page_track_bits obj_context; +}; + struct mlx5_ifc_msecq_reg_bits { u8 reserved_at_0[0x20]; -- cgit v1.2.3 From 4de676d494cd8fb2b4c65e58c19ebbdb36673957 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Tue, 20 Feb 2024 17:20:53 +0530 Subject: vfio/pci: rename and export do_io_rw() do_io_rw() is used to read/write to the device MMIO. The Grace Hopper VFIO PCI variant driver requires this functionality to read/write to its memory. Rename this as a vfio_pci_core function and export as GPL. 
Reviewed-by: Kevin Tian Reviewed-by: Yishai Hadas Signed-off-by: Ankit Agrawal Link: https://lore.kernel.org/r/20240220115055.23546-2-ankita@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 85e84b92751b..cf9480a31f3e 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -130,7 +130,10 @@ void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev); int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar); pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev, pci_channel_state_t state); - +ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, + void __iomem *io, char __user *buf, + loff_t off, size_t count, size_t x_start, + size_t x_end, bool iswrite); #define VFIO_IOWRITE_DECLATION(size) \ int vfio_pci_core_iowrite##size(struct vfio_pci_core_device *vdev, \ bool test_mem, u##size val, void __iomem *io); -- cgit v1.2.3 From 30e920e1debb437e5aea7a4ccdab61634354297a Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Tue, 20 Feb 2024 17:20:54 +0530 Subject: vfio/pci: rename and export range_intersect_range range_intersect_range determines an overlap between two ranges. If there is an overlap, the helper function returns the overlapping offset and size. The VFIO PCI variant driver emulates the PCI config space BAR offset registers. These offsets may be accessed for read/write with a variety of lengths including sub-word sizes from sub-word offsets. The driver makes use of this helper function to read/write the targeted part of the emulated register. Make this a vfio_pci_core function, rename and export as GPL. Also update references in the virtio driver. 
Reviewed-by: Kevin Tian Reviewed-by: Yishai Hadas Signed-off-by: Ankit Agrawal Link: https://lore.kernel.org/r/20240220115055.23546-3-ankita@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index cf9480a31f3e..a2c8b8bba711 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -134,6 +134,11 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, void __iomem *io, char __user *buf, loff_t off, size_t count, size_t x_start, size_t x_end, bool iswrite); +bool vfio_pci_core_range_intersect_range(loff_t buf_start, size_t buf_cnt, + loff_t reg_start, size_t reg_cnt, + loff_t *buf_offset, + size_t *intersect_count, + size_t *register_offset); #define VFIO_IOWRITE_DECLATION(size) \ int vfio_pci_core_iowrite##size(struct vfio_pci_core_device *vdev, \ bool test_mem, u##size val, void __iomem *io); -- cgit v1.2.3 From da510964c095cb5e070800ef38752c453d2aa71d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 21:44:31 +0100 Subject: mm/mmu_gather: define ENCODED_PAGE_FLAG_DELAY_RMAP Nowadays, encoded pages are only used in mmu_gather handling. Let's update the documentation, and define ENCODED_PAGE_BIT_DELAY_RMAP. While at it, rename ENCODE_PAGE_BITS to ENCODED_PAGE_BITS. If encoded page pointers would ever be used in other context again, we'd likely want to change the defines to reflect their context (e.g., ENCODED_PAGE_FLAG_MMU_GATHER_DELAY_RMAP). For now, let's keep it simple. This is a preparation for using the remaining spare bit to indicate that the next item in an array of encoded pages is a "nr_pages" argument and not an encoded page. 
Link: https://lkml.kernel.org/r/20240214204435.167852-7-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Heiko Carstens Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8b611e13153e..1b89eec0d6df 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -210,8 +210,8 @@ struct page { * * An 'encoded_page' pointer is a pointer to a regular 'struct page', but * with the low bits of the pointer indicating extra context-dependent - * information. Not super-common, but happens in mmu_gather and mlock - * handling, and this acts as a type system check on that use. + * information. Only used in mmu_gather handling, and this acts as a type + * system check on that use. * * We only really have two guaranteed bits in general, although you could * play with 'struct page' alignment (see CONFIG_HAVE_ALIGNED_STRUCT_PAGE) @@ -220,21 +220,26 @@ struct page { * Use the supplied helper functions to endcode/decode the pointer and bits. */ struct encoded_page; -#define ENCODE_PAGE_BITS 3ul + +#define ENCODED_PAGE_BITS 3ul + +/* Perform rmap removal after we have flushed the TLB. 
*/ +#define ENCODED_PAGE_BIT_DELAY_RMAP 1ul + static __always_inline struct encoded_page *encode_page(struct page *page, unsigned long flags) { - BUILD_BUG_ON(flags > ENCODE_PAGE_BITS); + BUILD_BUG_ON(flags > ENCODED_PAGE_BITS); return (struct encoded_page *)(flags | (unsigned long)page); } static inline unsigned long encoded_page_flags(struct encoded_page *page) { - return ENCODE_PAGE_BITS & (unsigned long)page; + return ENCODED_PAGE_BITS & (unsigned long)page; } static inline struct page *encoded_page_ptr(struct encoded_page *page) { - return (struct page *)(~ENCODE_PAGE_BITS & (unsigned long)page); + return (struct page *)(~ENCODED_PAGE_BITS & (unsigned long)page); } /* -- cgit v1.2.3 From d7f861b9c43aadbe384ab1382d2e76750bedc91e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 21:44:33 +0100 Subject: mm/mmu_gather: add __tlb_remove_folio_pages() Add __tlb_remove_folio_pages(), which will remove multiple consecutive pages that belong to the same large folio, instead of only a single page. We'll be using this function when optimizing unmapping/zapping of large folios that are mapped by PTEs. We're using the remaining spare bit in an encoded_page to indicate that the next encoded page in an array actually contains the shifted "nr_pages". Teach swap/freeing code about putting multiple folio references, and delayed rmap handling to remove page ranges of a folio. This extension allows for still gathering almost as many small folios as we used to (-1, because we have to prepare for a possibly bigger next entry), but still allows for gathering consecutive pages that belong to the same large folio. Note that we don't pass the folio pointer, because it is not required for now. Further, we don't support page_size != PAGE_SIZE, it won't be required for simple PTE batching. We have to provide a separate s390 implementation, but it's fairly straightforward. 
Another, more invasive and likely more expensive, approach would be to use folio+range or a PFN range instead of page+nr_pages. But, we should do that consistently for the whole mmu_gather. For now, let's keep it simple and add "nr_pages" only. Note that it is now possible to gather significantly more pages: In the past, we were able to gather ~10000 pages, now we can also gather ~5000 folio fragments that span multiple pages. A folio fragment on x86-64 can span up to 512 pages (2 MiB THP) and on arm64 with 64k in theory 8192 pages (512 MiB THP). Gathering more memory is not considered something we should worry about, especially because these are already corner cases. While we can gather more total memory, we won't free more folio fragments. As long as page freeing time primarily only depends on the number of involved folios, there is no effective change for !preempt configurations. However, we'll adjust tlb_batch_pages_flush() separately to handle corner cases where page freeing time grows proportionally with the actual memory size. Link: https://lkml.kernel.org/r/20240214204435.167852-9-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Heiko Carstens Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1b89eec0d6df..a7223ba3ea1e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -226,6 +226,15 @@ struct encoded_page; /* Perform rmap removal after we have flushed the TLB. 
*/ #define ENCODED_PAGE_BIT_DELAY_RMAP 1ul +/* + * The next item in an encoded_page array is the "nr_pages" argument, specifying + * the number of consecutive pages starting from this page, that all belong to + * the same folio. For example, "nr_pages" corresponds to the number of folio + * references that must be dropped. If this bit is not set, "nr_pages" is + * implicitly 1. + */ +#define ENCODED_PAGE_BIT_NR_PAGES_NEXT 2ul + static __always_inline struct encoded_page *encode_page(struct page *page, unsigned long flags) { BUILD_BUG_ON(flags > ENCODED_PAGE_BITS); @@ -242,6 +251,17 @@ static inline struct page *encoded_page_ptr(struct encoded_page *page) return (struct page *)(~ENCODED_PAGE_BITS & (unsigned long)page); } +static __always_inline struct encoded_page *encode_nr_pages(unsigned long nr) +{ + VM_WARN_ON_ONCE((nr << 2) >> 2 != nr); + return (struct encoded_page *)(nr << 2); +} + +static __always_inline unsigned long encoded_nr_pages(struct encoded_page *page) +{ + return ((unsigned long)page) >> 2; +} + /* * A swap entry has to fit into a "unsigned long", as the entry is hidden * in the "index" field of the swapper address space. -- cgit v1.2.3 From 10ebac4f95e7a9951c453d6c66d9beb5a35db338 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 14 Feb 2024 21:44:35 +0100 Subject: mm/memory: optimize unmap/zap with PTE-mapped THP Similar to how we optimized fork(), let's implement PTE batching when consecutive (present) PTEs map consecutive pages of the same large folio. Most infrastructure we need for batching (mmu gather, rmap) is already there. We only have to add get_and_clear_full_ptes() and clear_full_ptes(). Similarly, extend zap_install_uffd_wp_if_needed() to process a PTE range. We won't bother sanity-checking the mapcount of all subpages, but only check the mapcount of the first subpage we process. If there is a real problem hiding somewhere, we can trigger it simply by using small folios, or when we zap single pages of a large folio. 
Ideally, we had that check in rmap code (including for delayed rmap), but then we cannot print the PTE. Let's keep it simple for now. If we ever have a cheap folio_mapcount(), we might just want to check for underflows there. To keep small folios as fast as possible force inlining of a specialized variant using __always_inline with nr=1. Link: https://lkml.kernel.org/r/20240214204435.167852-11-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Heiko Carstens Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Peter Zijlstra (Intel) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 70 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index aab227e12493..49ab1f73b5c2 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -580,6 +580,76 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, } #endif +#ifndef get_and_clear_full_ptes +/** + * get_and_clear_full_ptes - Clear present PTEs that map consecutive pages of + * the same folio, collecting dirty/accessed bits. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear. + * @full: Whether we are clearing a full mm. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over ptep_get_and_clear_full(), merging dirty/accessed bits into the + * returned PTE. + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. 
+ * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, unsigned int nr, int full) +{ + pte_t pte, tmp_pte; + + pte = ptep_get_and_clear_full(mm, addr, ptep, full); + while (--nr) { + ptep++; + addr += PAGE_SIZE; + tmp_pte = ptep_get_and_clear_full(mm, addr, ptep, full); + if (pte_dirty(tmp_pte)) + pte = pte_mkdirty(pte); + if (pte_young(tmp_pte)) + pte = pte_mkyoung(pte); + } + return pte; +} +#endif + +#ifndef clear_full_ptes +/** + * clear_full_ptes - Clear present PTEs that map consecutive pages of the same + * folio. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear. + * @full: Whether we are clearing a full mm. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over ptep_get_and_clear_full(). + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full) +{ + for (;;) { + ptep_get_and_clear_full(mm, addr, ptep, full); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } +} +#endif /* * If two threads concurrently fault at the same page, the thread that -- cgit v1.2.3 From 6280d7317ccae19c776a3b6cf9848c964f958091 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:48 +0000 Subject: mm: clarify the spec for set_ptes() Patch series "Transparent Contiguous PTEs for User Mappings", v6. 
This is a series to opportunistically and transparently use contpte mappings (set the contiguous bit in ptes) for user memory when those mappings meet the requirements. The change benefits arm64, but there is some (very) minor refactoring for x86 to enable its integration with core-mm. It is part of a wider effort to improve performance by allocating and mapping variable-sized blocks of memory (folios). One aim is for the 4K kernel to approach the performance of the 16K kernel, but without breaking compatibility and without the associated increase in memory. Another aim is to benefit the 16K and 64K kernels by enabling 2M THP, since this is the contpte size for those kernels. We have good performance data that demonstrates both aims are being met (see below). Of course this is only one half of the change. We require the mapped physical memory to be the correct size and alignment for this to actually be useful (i.e. 64K for 4K pages, or 2M for 16K/64K pages). Fortunately folios are solving this problem for us. Filesystems that support it (XFS, AFS, EROFS, tmpfs, ...) will allocate large folios up to the PMD size today, and more filesystems are coming. And for anonymous memory, "multi-size THP" is now upstream. 
Patch Layout ============ In this version, I've split the patches to better show each optimization: - 1-2: mm prep: misc code and docs cleanups - 3-6: mm,arm64,x86 prep: Add pte_advance_pfn() and make pte_next_pfn() a generic wrapper around it - 7-11: arm64 prep: Refactor ptep helpers into new layer - 12: functional contpte implementation - 13-18: various optimizations on top of the contpte implementation Testing ======= I've tested this series on both Ampere Altra (bare metal) and Apple M2 (VM): - mm selftests (inc new tests written for multi-size THP); no regressions - Speedometer JavaScript benchmark in Chromium web browser; no issues - Kernel compilation; no issues - Various tests under high memory pressure with swap enabled; no issues Performance =========== High Level Use Cases ~~~~~~~~~~~~~~~~~~~~ First some high level use cases (kernel compilation and speedometer JavaScript benchmarks). These are running on Ampere Altra (I've seen similar improvements on Android/Pixel 6). baseline: mm-unstable (mTHP switched off) mTHP: + enable 16K, 32K, 64K mTHP sizes "always" mTHP + contpte: + this series mTHP + contpte + exefolio: + patch at [6], which this series supports Kernel Compilation with -j8 (negative is faster): | kernel | real-time | kern-time | user-time | |---------------------------|-----------|-----------|-----------| | baseline | 0.0% | 0.0% | 0.0% | | mTHP | -5.0% | -39.1% | -0.7% | | mTHP + contpte | -6.0% | -41.4% | -1.5% | | mTHP + contpte + exefolio | -7.8% | -43.1% | -3.4% | Kernel Compilation with -j80 (negative is faster): | kernel | real-time | kern-time | user-time | |---------------------------|-----------|-----------|-----------| | baseline | 0.0% | 0.0% | 0.0% | | mTHP | -5.0% | -36.6% | -0.6% | | mTHP + contpte | -6.1% | -38.2% | -1.6% | | mTHP + contpte + exefolio | -7.4% | -39.2% | -3.2% | Speedometer (positive is faster): | kernel | runs_per_min | |:--------------------------|--------------| | baseline | 0.0% | | mTHP | 1.5% | | mTHP +
contpte | 3.2% | | mTHP + contpte + exefolio | 4.5% | Micro Benchmarks ~~~~~~~~~~~~~~~~ The following microbenchmarks are intended to demonstrate that the performance of fork() and munmap() does not regress. I'm showing results for order-0 (4K) mappings, and for order-9 (2M) PTE-mapped THP. Thanks to David for sharing his benchmarks. baseline: mm-unstable + batch zap [7] series contpte-basic: + patches 0-19; functional contpte implementation contpte-batch: + patches 20-23; implement new batched APIs contpte-inline: + patch 24; __always_inline to help compiler contpte-fold: + patch 25; fold contpte mapping when sensible Primary platform is Ampere Altra bare metal. I'm also showing results for M2 VM (on top of macOS) for reference, although experience suggests this might not be the most reliable for performance numbers of this sort: | FORK | order-0 | order-9 | | Ampere Altra |------------------------|------------------------| | (pte-map) | mean | stdev | mean | stdev | |----------------|------------|-----------|------------|-----------| | baseline | 0.0% | 2.7% | 0.0% | 0.2% | | contpte-basic | 6.3% | 1.4% | 1948.7% | 0.2% | | contpte-batch | 7.6% | 2.0% | -1.9% | 0.4% | | contpte-inline | 3.6% | 1.5% | -1.0% | 0.2% | | contpte-fold | 4.6% | 2.1% | -1.8% | 0.2% | | MUNMAP | order-0 | order-9 | | Ampere Altra |------------------------|------------------------| | (pte-map) | mean | stdev | mean | stdev | |----------------|------------|-----------|------------|-----------| | baseline | 0.0% | 0.5% | 0.0% | 0.3% | | contpte-basic | 1.8% | 0.3% | 1104.8% | 0.1% | | contpte-batch | -0.3% | 0.4% | 2.7% | 0.1% | | contpte-inline | -0.1% | 0.6% | 0.9% | 0.1% | | contpte-fold | 0.1% | 0.6% | 0.8% | 0.1% | | FORK | order-0 | order-9 | | Apple M2 VM |------------------------|------------------------| | (pte-map) | mean | stdev | mean | stdev | |----------------|------------|-----------|------------|-----------| | baseline | 0.0% | 1.4% | 0.0% | 0.8% | | contpte-basic | 6.8% | 1.2% |
469.4% | 1.4% | | contpte-batch | -7.7% | 2.0% | -8.9% | 0.7% | | contpte-inline | -6.0% | 2.1% | -6.0% | 2.0% | | contpte-fold | 5.9% | 1.4% | -6.4% | 1.4% | | MUNMAP | order-0 | order-9 | | Apple M2 VM |------------------------|------------------------| | (pte-map) | mean | stdev | mean | stdev | |----------------|------------|-----------|------------|-----------| | baseline | 0.0% | 0.6% | 0.0% | 0.4% | | contpte-basic | 1.6% | 0.6% | 233.6% | 0.7% | | contpte-batch | 1.9% | 0.3% | -3.9% | 0.4% | | contpte-inline | 2.2% | 0.8% | -1.6% | 0.9% | | contpte-fold | 1.5% | 0.7% | -1.7% | 0.7% | Misc ~~~~ John Hubbard at Nvidia has indicated dramatic 10x performance improvements for some workloads at [8], when using 64K base page kernel. [1] https://lore.kernel.org/linux-arm-kernel/20230622144210.2623299-1-ryan.roberts@arm.com/ [2] https://lore.kernel.org/linux-arm-kernel/20231115163018.1303287-1-ryan.roberts@arm.com/ [3] https://lore.kernel.org/linux-arm-kernel/20231204105440.61448-1-ryan.roberts@arm.com/ [4] https://lore.kernel.org/lkml/20231218105100.172635-1-ryan.roberts@arm.com/ [5] https://lore.kernel.org/linux-mm/633af0a7-0823-424f-b6ef-374d99483f05@arm.com/ [6] https://lore.kernel.org/lkml/08c16f7d-f3b3-4f22-9acc-da943f647dc3@arm.com/ [7] https://lore.kernel.org/linux-mm/20240214204435.167852-1-david@redhat.com/ [8] https://lore.kernel.org/linux-mm/c507308d-bdd4-5f9e-d4ff-e96e4520be85@nvidia.com/ [9] https://gitlab.arm.com/linux-arm/linux-rr/-/tree/features/granule_perf/contpte-lkml_v6 This patch (of 18): set_ptes() spec implies that it can only be used to set a present pte because it interprets the PFN field to increment it. However, set_pte_at() has been implemented on top of set_ptes() since set_ptes() was introduced, and set_pte_at() allows setting a pte to a not-present state. So clarify the spec to state that when nr==1, new state of pte may be present or not present. When nr>1, new state of all ptes must be present. 
While we are at it, tighten the spec to set requirements around the initial state of ptes; when nr==1 it may be either present or not-present. But when nr>1 all ptes must initially be not-present. All set_ptes() callsites already conform to this requirement. Stating it explicitly is useful because it allows for a simplification to the upcoming arm64 contpte implementation. Link: https://lkml.kernel.org/r/20240215103205.2607016-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20240215103205.2607016-2-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 49ab1f73b5c2..231370e1b80f 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -229,6 +229,10 @@ static inline pte_t pte_next_pfn(pte_t pte) * @pte: Page table entry for the first page. * @nr: Number of pages to map. * + * When nr==1, initial state of pte may be present or not present, and new state + * may be present or not present. When nr>1, initial state of all ptes must be + * not present, and new state must be present. + * * May be overridden by the architecture, or the architecture can define * set_pte() and PFN_PTE_SHIFT. * -- cgit v1.2.3 From 583ceaaa339960e673ac0029f323bb1c6ffc96d7 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:50 +0000 Subject: mm: introduce pte_advance_pfn() and use for pte_next_pfn() The goal is to be able to advance a PTE by an arbitrary number of PFNs. 
So introduce a new API that takes a nr param. Define the default implementation here and allow for architectures to override. pte_next_pfn() becomes a wrapper around pte_advance_pfn(). Follow up commits will convert each overriding architecture's pte_next_pfn() to pte_advance_pfn(). Link: https://lkml.kernel.org/r/20240215103205.2607016-4-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 231370e1b80f..b7ac8358f2aa 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -212,14 +212,17 @@ static inline int pmd_dirty(pmd_t pmd) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif - #ifndef pte_next_pfn -static inline pte_t pte_next_pfn(pte_t pte) +#ifndef pte_advance_pfn +static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { - return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT)); + return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT)); } #endif +#define pte_next_pfn(pte) pte_advance_pfn(pte, 1) +#endif + #ifndef set_ptes /** * set_ptes - Map consecutive pages to a contiguous range of addresses. 
-- cgit v1.2.3 From fb23bf6bd288db3187c27b971e558a3e9f70ae96 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:53 +0000 Subject: mm: tidy up pte_next_pfn() definition Now that all the architecture overrides of pte_next_pfn() have been replaced with pte_advance_pfn(), we can simplify the definition of the generic pte_next_pfn() macro so that it is unconditionally defined. Link: https://lkml.kernel.org/r/20240215103205.2607016-7-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index b7ac8358f2aa..bc005d84f764 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -212,7 +212,6 @@ static inline int pmd_dirty(pmd_t pmd) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif -#ifndef pte_next_pfn #ifndef pte_advance_pfn static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { @@ -221,7 +220,6 @@ static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) #endif #define pte_next_pfn(pte) pte_advance_pfn(pte, 1) -#endif #ifndef set_ptes /** -- cgit v1.2.3 From 4602e5757bcceb231c3a13c36c373ad4a750eddb Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:59 +0000 Subject: arm64/mm: wire up PTE_CONT for user mappings With the ptep API sufficiently refactored, we can now introduce a new "contpte" API layer, which transparently manages the PTE_CONT bit for user mappings.
In this initial implementation, only suitable batches of PTEs, set via set_ptes(), are mapped with the PTE_CONT bit. Any subsequent modification of individual PTEs will cause an "unfold" operation to repaint the contpte block as individual PTEs before performing the requested operation. While a modification of a single PTE could cause the block of PTEs to which it belongs to become eligible for "folding" into a contpte entry, "folding" is not performed in this initial implementation due to the costs of checking the requirements are met. Due to this, contpte mappings will degrade back to normal pte mappings over time if/when protections are changed. This will be solved in a future patch. Since a contpte block only has a single access and dirty bit, the semantic here changes slightly; when getting a pte (e.g. ptep_get()) that is part of a contpte mapping, the access and dirty information are pulled from the block (so all ptes in the block return the same access/dirty info). When changing the access/dirty info on a pte (e.g. ptep_set_access_flags()) that is part of a contpte mapping, this change will affect the whole contpte block. This works fine in practice since we guarantee that only a single folio is mapped by a contpte block, and the core-mm tracks access/dirty information per folio. In order for the public functions, which used to be pure inline, to continue to be callable by modules, export all the contpte_* symbols that are now called by those public inline functions. The feature is enabled/disabled with the ARM64_CONTPTE Kconfig parameter at build time. It defaults to enabled as long as its dependency, TRANSPARENT_HUGEPAGE is also enabled. The core-mm depends upon TRANSPARENT_HUGEPAGE to be able to allocate large folios, so if it's not enabled, then there is no chance of meeting the physical contiguity requirement for contpte mappings.
Link: https://lkml.kernel.org/r/20240215103205.2607016-13-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: Ard Biesheuvel Tested-by: John Hubbard Acked-by: Mark Rutland Reviewed-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/efi.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index c74f47711f0b..57da15e7429c 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -692,6 +692,11 @@ extern struct efi { extern struct mm_struct efi_mm; +static inline bool mm_is_efi(struct mm_struct *mm) +{ + return IS_ENABLED(CONFIG_EFI) && mm == &efi_mm; +} + static inline int efi_guidcmp (efi_guid_t left, efi_guid_t right) { -- cgit v1.2.3 From c6ec76a2ebc5829e5826b218d2e1475ec11b333e Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:32:02 +0000 Subject: mm: add pte_batch_hint() to reduce scanning in folio_pte_batch() Some architectures (e.g. arm64) can tell from looking at a pte, if some follow-on ptes also map contiguous physical memory with the same pgprot. (for arm64, these are contpte mappings). Take advantage of this knowledge to optimize folio_pte_batch() so that it can skip these ptes when scanning to create a batch. By default, if an arch does not opt-in, pte_batch_hint() returns a compile-time 1, so the changes are optimized out and the behaviour is as before. arm64 will opt-in to providing this hint in the next patch, which will greatly reduce the cost of ptep_get() when scanning a range of contptes.
Link: https://lkml.kernel.org/r/20240215103205.2607016-16-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Tested-by: John Hubbard Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index bc005d84f764..a36cf4e124b0 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -212,6 +212,27 @@ static inline int pmd_dirty(pmd_t pmd) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif +#ifndef pte_batch_hint +/** + * pte_batch_hint - Number of pages that can be added to batch without scanning. + * @ptep: Page table pointer for the entry. + * @pte: Page table entry. + * + * Some architectures know that a set of contiguous ptes all map the same + * contiguous memory with the same permissions. In this case, it can provide a + * hint to aid pte batching without the core code needing to scan every pte. + * + * An architecture implementation may ignore the PTE accessed state. Further, + * the dirty state must apply atomically to all the PTEs described by the hint. + * + * May be overridden by the architecture, else pte_batch_hint is always 1. 
+ */ +static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte) +{ + return 1; +} +#endif + #ifndef pte_advance_pfn static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { -- cgit v1.2.3 From 2807c54b3809ca5e51e5a9dfa5e9ddb8993a0e6f Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 15 Feb 2024 09:46:25 -0500 Subject: dax: add empty static inline for CONFIG_DAX=n Patch series "Introduce cpu_dcache_is_aliasing() to fix DAX regression", v6. This commit introduced in v4.0 prevents building FS_DAX on 32-bit ARM, even on ARMv7 which does not have virtually aliased data caches: commit d92576f1167c ("dax: does not work correctly with virtual aliasing caches") Even though it used to work fine before. The root of the issue here is the fact that DAX was never designed to handle virtually aliasing data caches (VIVT and VIPT with aliasing data cache). It touches the pages through their linear mapping, which is not consistent with the userspace mappings with virtually aliasing data caches. This patch series introduces cpu_dcache_is_aliasing() with the new Kconfig option ARCH_HAS_CPU_CACHE_ALIASING and implements it for all architectures. The implementation of cpu_dcache_is_aliasing() is either evaluated to a constant at compile-time or a runtime check, which is what is needed on ARM. With this we can basically narrow down the list of architectures which are unsupported by DAX to those which are really affected. This patch (of 9): When building a kernel with CONFIG_DAX=n, all uses of set_dax_nocache() and set_dax_nomc() need to be either within regions of code or compile units which are explicitly not compiled, or they need to rely on compiler optimizations to eliminate calls to those undefined symbols. It appears that at least the openrisc and loongarch architectures don't end up eliminating those undefined symbols even if they are provably within code which is eliminated due to conditional branches depending on constants. 
Implement empty static inline functions for set_dax_nocache() and set_dax_nomc() in CONFIG_DAX=n to ensure those undefined references are removed. Link: https://lkml.kernel.org/r/20240215144633.96437-1-mathieu.desnoyers@efficios.com Link: https://lkml.kernel.org/r/20240215144633.96437-2-mathieu.desnoyers@efficios.com Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202402140037.wGfA1kqX-lkp@intel.com/ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202402131351.a0FZOgEG-lkp@intel.com/ Fixes: 7ac5360cd4d0 ("dax: remove the copy_from_iter and copy_to_iter methods") Signed-off-by: Mathieu Desnoyers Cc: Christoph Hellwig Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: Matthew Wilcox Cc: Arnd Bergmann Cc: Russell King Cc: Dave Chinner Cc: Michael Sclafani Cc: Alasdair Kergon Cc: Heiko Carstens Cc: Mike Snitzer Cc: Mikulas Patocka Signed-off-by: Andrew Morton --- include/linux/dax.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index b463502b16e1..e3ffe7c7f01d 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -63,6 +63,8 @@ void kill_dax(struct dax_device *dax_dev); void dax_write_cache(struct dax_device *dax_dev, bool wc); bool dax_write_cache_enabled(struct dax_device *dax_dev); bool dax_synchronous(struct dax_device *dax_dev); +void set_dax_nocache(struct dax_device *dax_dev); +void set_dax_nomc(struct dax_device *dax_dev); void set_dax_synchronous(struct dax_device *dax_dev); size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); @@ -109,6 +111,12 @@ static inline bool dax_synchronous(struct dax_device *dax_dev) { return true; } +static inline void set_dax_nocache(struct dax_device *dax_dev) +{ +} +static inline void set_dax_nomc(struct dax_device *dax_dev) +{ +} static inline void set_dax_synchronous(struct dax_device *dax_dev) { } 
@@ -124,9 +132,6 @@ static inline size_t dax_recovery_write(struct dax_device *dax_dev, } #endif -void set_dax_nocache(struct dax_device *dax_dev); -void set_dax_nomc(struct dax_device *dax_dev); - struct writeback_control; #if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX) int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk); -- cgit v1.2.3 From 6d439c18d9b190ab1e0f1196bc45590f95752bf1 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 15 Feb 2024 09:46:26 -0500 Subject: dax: alloc_dax() return ERR_PTR(-EOPNOTSUPP) for CONFIG_DAX=n Change the return value from NULL to ERR_PTR(-EOPNOTSUPP) for CONFIG_DAX=n to be consistent with the fact that CONFIG_DAX=y never returns NULL. This is done in preparation for using cpu_dcache_is_aliasing() in a following change which will properly support architectures which detect data cache aliasing at runtime. Link: https://lkml.kernel.org/r/20240215144633.96437-3-mathieu.desnoyers@efficios.com Fixes: 4e4ced93794a ("dax: Move mandatory ->zero_page_range() check in alloc_dax()") Signed-off-by: Mathieu Desnoyers Reviewed-by: Dan Williams Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: Matthew Wilcox Cc: Arnd Bergmann Cc: Russell King Cc: Alasdair Kergon Cc: Christoph Hellwig Cc: Dave Chinner Cc: Heiko Carstens Cc: kernel test robot Cc: Michael Sclafani Cc: Mike Snitzer Cc: Mikulas Patocka Signed-off-by: Andrew Morton --- include/linux/dax.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index e3ffe7c7f01d..9d3e3327af4c 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -88,11 +88,7 @@ static inline void *dax_holder(struct dax_device *dax_dev) static inline struct dax_device *alloc_dax(void *private, const struct dax_operations *ops) { - /* - * Callers should check IS_ENABLED(CONFIG_DAX) to know if this - * NULL is an error or expected.
- */ - return NULL; + return ERR_PTR(-EOPNOTSUPP); } static inline void put_dax(struct dax_device *dax_dev) { -- cgit v1.2.3 From 8690bbcf3b7010b31fdbf3851e1add6ae19b8624 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 15 Feb 2024 09:46:32 -0500 Subject: Introduce cpu_dcache_is_aliasing() across all architectures Introduce a generic way to query whether the data cache is virtually aliased on all architectures. Its purpose is to ensure that subsystems which are incompatible with virtually aliased data caches (e.g. FS_DAX) can reliably query this. For data cache aliasing, there are three scenarios depending on the architecture. Here is a breakdown based on my understanding: A) The data cache is always aliasing: * arc * csky * m68k (note: shared memory mappings are incoherent ? SHMLBA is missing there.) * sh * parisc B) The data cache aliasing is statically known or depends on querying CPU state at runtime: * arm (cache_is_vivt() || cache_is_vipt_aliasing()) * mips (cpu_has_dc_aliases) * nios2 (NIOS2_DCACHE_SIZE > PAGE_SIZE) * sparc32 (vac_cache_size > PAGE_SIZE) * sparc64 (L1DCACHE_SIZE > PAGE_SIZE) * xtensa (DCACHE_WAY_SIZE > PAGE_SIZE) C) The data cache is never aliasing: * alpha * arm64 (aarch64) * hexagon * loongarch (but with incoherent write buffers, which are disabled since commit d23b7795 ("LoongArch: Change SHMLBA from SZ_64K to PAGE_SIZE")) * microblaze * openrisc * powerpc * riscv * s390 * um * x86 Require architectures in A) and B) to select ARCH_HAS_CPU_CACHE_ALIASING and implement "cpu_dcache_is_aliasing()". Architectures in C) don't select ARCH_HAS_CPU_CACHE_ALIASING, and thus cpu_dcache_is_aliasing() simply evaluates to "false". Note that this leaves "cpu_icache_is_aliasing()" to be implemented as future work. This would be useful to gate features like XIP on architectures which have aliasing CPU dcache-icache but not CPU dcache-dcache.
Use "cpu_dcache" and "cpu_cache" rather than just "dcache" and "cache" to clarify that we really mean "CPU data cache" and "CPU cache" to eliminate any possible confusion with VFS "dentry cache" and "page cache". Link: https://lore.kernel.org/lkml/20030910210416.GA24258@mail.jlokier.co.uk/ Link: https://lkml.kernel.org/r/20240215144633.96437-9-mathieu.desnoyers@efficios.com Fixes: d92576f1167c ("dax: does not work correctly with virtual aliasing caches") Signed-off-by: Mathieu Desnoyers Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: Matthew Wilcox Cc: Arnd Bergmann Cc: Russell King Cc: Alasdair Kergon Cc: Christoph Hellwig Cc: Dave Chinner Cc: Heiko Carstens Cc: kernel test robot Cc: Michael Sclafani Cc: Mike Snitzer Cc: Mikulas Patocka Signed-off-by: Andrew Morton --- include/linux/cacheinfo.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index d504eb4b49ab..2cb15fe4fe12 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -138,4 +138,10 @@ static inline int get_cpu_cacheinfo_id(int cpu, int level) #define use_arch_cache_info() (false) #endif +#ifndef CONFIG_ARCH_HAS_CPU_CACHE_ALIASING +#define cpu_dcache_is_aliasing() false +#else +#include <asm/cachetype.h> +#endif + #endif /* _LINUX_CACHEINFO_H */ -- cgit v1.2.3 From f91e6b41dd11daffb138e3afdb4804aefc3d4e1b Mon Sep 17 00:00:00 2001 From: Lokesh Gidra Date: Thu, 15 Feb 2024 10:27:53 -0800 Subject: userfaultfd: move userfaultfd_ctx struct to header file Patch series "per-vma locks in userfaultfd", v7. Performing userfaultfd operations (like copy/move etc.) in critical section of mmap_lock (read-mode) causes significant contention on the lock when operations requiring the lock in write-mode are taking place concurrently. We can use per-vma locks instead to significantly reduce the contention issue. Android runtime's Garbage Collector uses userfaultfd for concurrent compaction.
mmap-lock contention during compaction potentially causes jittery experience for the user. During one such reproducible scenario, we observed the following improvements with this patch-set: - Wall clock time of compaction phase came down from ~3s to <500ms - Uninterruptible sleep time (across all threads in the process) was ~10ms (none in mmap_lock) during compaction, instead of >20s This patch (of 4): Move the struct to userfaultfd_k.h to be accessible from mm/userfaultfd.c. There are no other changes in the struct. This is required to prepare for using per-vma locks in userfaultfd operations. Link: https://lkml.kernel.org/r/20240215182756.3448972-1-lokeshgidra@google.com Link: https://lkml.kernel.org/r/20240215182756.3448972-2-lokeshgidra@google.com Signed-off-by: Lokesh Gidra Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Liam R. Howlett Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Brian Geffon Cc: David Hildenbrand Cc: Jann Horn Cc: Kalesh Singh Cc: Matthew Wilcox (Oracle) Cc: Nicolas Geoffray Cc: Peter Xu Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Tim Murray Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'include/linux') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index e4056547fbe6..691d928ee864 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -36,6 +36,45 @@ #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) +/* + * Start with fault_pending_wqh and fault_wqh so they're more likely + * to be in the same cacheline. + * + * Locking order: + * fd_wqh.lock + * fault_pending_wqh.lock + * fault_wqh.lock + * event_wqh.lock + * + * To avoid deadlocks, IRQs must be disabled when taking any of the above locks, + * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's + * also taken in IRQ context. 
+ */ +struct userfaultfd_ctx { + /* waitqueue head for the pending (i.e. not read) userfaults */ + wait_queue_head_t fault_pending_wqh; + /* waitqueue head for the userfaults */ + wait_queue_head_t fault_wqh; + /* waitqueue head for the pseudo fd to wakeup poll/read */ + wait_queue_head_t fd_wqh; + /* waitqueue head for events */ + wait_queue_head_t event_wqh; + /* a refile sequence protected by fault_pending_wqh lock */ + seqcount_spinlock_t refile_seq; + /* pseudo fd refcounting */ + refcount_t refcount; + /* userfaultfd syscall flags */ + unsigned int flags; + /* features requested from the userspace */ + unsigned int features; + /* released */ + bool released; + /* memory mappings are changing because of non-cooperative event */ + atomic_t mmap_changing; + /* mm with one ore more vmas attached to this userfaultfd_ctx */ + struct mm_struct *mm; +}; + extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); /* A combined operation mode + behavior flags. */ -- cgit v1.2.3 From 5e4c24a57b0c126686534b5b159a406c5dd02400 Mon Sep 17 00:00:00 2001 From: Lokesh Gidra Date: Thu, 15 Feb 2024 10:27:54 -0800 Subject: userfaultfd: protect mmap_changing with rw_sem in userfaulfd_ctx Increments and loads to mmap_changing are always in mmap_lock critical section. This ensures that if userspace requests event notification for non-cooperative operations (e.g. mremap), userfaultfd operations don't occur concurrently. This can be achieved by using a separate read-write semaphore in userfaultfd_ctx such that increments are done in write-mode and loads in read-mode, thereby eliminating the dependency on mmap_lock for this purpose. This is a preparatory step before we replace mmap_lock usage with per-vma locks in fill/move ioctls. Link: https://lkml.kernel.org/r/20240215182756.3448972-3-lokeshgidra@google.com Signed-off-by: Lokesh Gidra Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Liam R. 
Howlett Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Brian Geffon Cc: David Hildenbrand Cc: Jann Horn Cc: Kalesh Singh Cc: Matthew Wilcox (Oracle) Cc: Nicolas Geoffray Cc: Peter Xu Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Tim Murray Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 691d928ee864..3210c3552976 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -69,6 +69,13 @@ struct userfaultfd_ctx { unsigned int features; /* released */ bool released; + /* + * Prevents userfaultfd operations (fill/move/wp) from happening while + * some non-cooperative event(s) is taking place. Increments are done + * in write-mode. Whereas, userfaultfd operations, which includes + * reading mmap_changing, is done under read-mode. + */ + struct rw_semaphore map_changing_lock; /* memory mappings are changing because of non-cooperative event */ atomic_t mmap_changing; /* mm with one ore more vmas attached to this userfaultfd_ctx */ @@ -113,22 +120,18 @@ extern int mfill_atomic_install_pte(pmd_t *dst_pmd, unsigned long dst_addr, struct page *page, bool newly_allocated, uffd_flags_t flags); -extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start, +extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, - atomic_t *mmap_changing, uffd_flags_t flags); -extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, + uffd_flags_t flags); +extern ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx, unsigned long dst_start, - unsigned long len, - atomic_t *mmap_changing); -extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long len, atomic_t *mmap_changing, - uffd_flags_t flags); -extern ssize_t 
mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, atomic_t *mmap_changing, - uffd_flags_t flags); -extern int mwriteprotect_range(struct mm_struct *dst_mm, - unsigned long start, unsigned long len, - bool enable_wp, atomic_t *mmap_changing); + unsigned long len); +extern ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long dst_start, + unsigned long len, uffd_flags_t flags); +extern ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start, + unsigned long len, uffd_flags_t flags); +extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, + unsigned long len, bool enable_wp); extern long uffd_wp_range(struct vm_area_struct *vma, unsigned long start, unsigned long len, bool enable_wp); -- cgit v1.2.3 From 32af81af2f6f4c23b1b4ff68410e91da660af102 Mon Sep 17 00:00:00 2001 From: Lokesh Gidra Date: Thu, 15 Feb 2024 10:27:55 -0800 Subject: mm: add vma_assert_locked() for !CONFIG_PER_VMA_LOCK vma_assert_locked() is needed to replace mmap_assert_locked() once we start using per-vma locks in userfaultfd operations. In !CONFIG_PER_VMA_LOCK case when mm is locked, it implies that the given VMA is locked. Link: https://lkml.kernel.org/r/20240215182756.3448972-4-lokeshgidra@google.com Signed-off-by: Lokesh Gidra Reviewed-by: Suren Baghdasaryan Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Brian Geffon Cc: David Hildenbrand Cc: Jann Horn Cc: Kalesh Singh Cc: Liam R. 
Howlett Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (IBM) Cc: Nicolas Geoffray Cc: Peter Xu Cc: Ryan Roberts Cc: Tim Murray Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ac6b71cbdffb..6f4825d82965 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -781,6 +781,11 @@ static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, return NULL; } +static inline void vma_assert_locked(struct vm_area_struct *vma) +{ + mmap_assert_locked(vma->vm_mm); +} + static inline void release_fault_lock(struct vm_fault *vmf) { mmap_read_unlock(vmf->vma->vm_mm); -- cgit v1.2.3 From 867a43a34ff8a38772212045262b2c9b77807ea3 Mon Sep 17 00:00:00 2001 From: Lokesh Gidra Date: Thu, 15 Feb 2024 10:27:56 -0800 Subject: userfaultfd: use per-vma locks in userfaultfd operations All userfaultfd operations, except write-protect, opportunistically use per-vma locks to lock vmas. On failure, attempt again inside mmap_lock critical section. Write-protect operation requires mmap_lock as it iterates over multiple vmas. Link: https://lkml.kernel.org/r/20240215182756.3448972-5-lokeshgidra@google.com Signed-off-by: Lokesh Gidra Reviewed-by: Liam R. 
Howlett Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Brian Geffon Cc: David Hildenbrand Cc: Jann Horn Cc: Kalesh Singh Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (IBM) Cc: Nicolas Geoffray Cc: Peter Xu Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Tim Murray Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 3210c3552976..05d59f74fc88 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -138,9 +138,8 @@ extern long uffd_wp_range(struct vm_area_struct *vma, /* move_pages */ void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2); void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2); -ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, - unsigned long dst_start, unsigned long src_start, - unsigned long len, __u64 flags); +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, + unsigned long src_start, unsigned long len, __u64 flags); int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, -- cgit v1.2.3 From fafdea34194a10deefc0a0f1dace4280079ce0e7 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 9 Jan 2024 15:16:30 -0700 Subject: arch and include: update LLVM Phabricator links reviews.llvm.org was LLVM's Phabricator instances for code review. It has been abandoned in favor of GitHub pull requests. While the majority of links in the kernel sources still work because of the work Fangrui has done turning the dynamic Phabricator instance into a static archive, there are some issues with that work, so preemptively convert all the links in the kernel sources to point to the commit on GitHub. 
Most of the commits have the corresponding differential review link in the commit message itself so there should not be any loss of fidelity in the relevant information. Link: https://discourse.llvm.org/t/update-on-github-pull-requests/71540/172 Link: https://lkml.kernel.org/r/20240109-update-llvm-links-v1-2-eb09b59db071@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Conor Dooley Reviewed-by: Kees Cook Acked-by: Fangrui Song Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Daniel Borkmann Cc: Mykola Lysenko Signed-off-by: Andrew Morton --- include/linux/compiler-clang.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index ddab1ef22bee..f0a47afef125 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -9,7 +9,7 @@ * Clang prior to 17 is being silly and considers many __cleanup() variables * as unused (because they are, their sole purpose is to go out of scope). * - * https://reviews.llvm.org/D152180 + * https://github.com/llvm/llvm-project/commit/877210faa447f4cc7db87812f8ed80e398fedd61 */ #undef __cleanup #define __cleanup(func) __maybe_unused __attribute__((__cleanup__(func))) -- cgit v1.2.3 From a43c47561e46cbefc6a2fe2c3f8f6bda0e553a83 Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Thu, 4 Jan 2024 17:49:33 +0100 Subject: list: add hlist_count_nodes() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a generic hlist_count_nodes() function and use it in two drivers. This patch (of 3): Add a function to count nodes in a hlist. hlist_count_nodes() is similar to list_count_nodes(). 
Link: https://lkml.kernel.org/r/20240104164937.424320-1-pierre.gondois@arm.com Link: https://lkml.kernel.org/r/20240104164937.424320-2-pierre.gondois@arm.com Signed-off-by: Pierre Gondois Reviewed-by: Carlos Llamas Acked-by: Coly Li Acked-by: Marco Elver Reviewed-by: Andy Shevchenko Cc: Arve Hjønnevåg Cc: Christian Brauner Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Jani Nikula Cc: Joel Fernandes (Google) Cc: Kees Cook Cc: Kent Overstreet Cc: Martijn Coenen Cc: Suren Baghdasaryan Cc: Todd Kjos Signed-off-by: Andrew Morton --- include/linux/list.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index 059aa1fff41e..523b7c4d000a 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -1195,4 +1195,19 @@ static inline void hlist_splice_init(struct hlist_head *from, pos && ({ n = pos->member.next; 1; }); \ pos = hlist_entry_safe(n, typeof(*pos), member)) +/** + * hlist_count_nodes - count nodes in the hlist + * @head: the head for your hlist. + */ +static inline size_t hlist_count_nodes(struct hlist_head *head) +{ + struct hlist_node *pos; + size_t count = 0; + + hlist_for_each(pos, head) + count++; + + return count; +} + #endif -- cgit v1.2.3 From 3911fb647b65f11d42bc97e0890bba8ef7e1e0e6 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 7 Jan 2024 14:01:55 -0800 Subject: lib/win_minmax: fix header comments Don't use "/**" kernel-doc comment marker for non-kernel-doc comment. Correct the filename but omit the path since we know where it is and it could change (but not likely). 
Link: https://lkml.kernel.org/r/20240107220155.29013-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton --- include/linux/win_minmax.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/win_minmax.h b/include/linux/win_minmax.h index 4ca2842d2842..6a5bb052fcc2 100644 --- a/include/linux/win_minmax.h +++ b/include/linux/win_minmax.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/** - * lib/minmax.c: windowed min/max tracker by Kathleen Nichols. +/* + * win_minmax.h: windowed min/max tracker by Kathleen Nichols. * */ #ifndef MINMAX_H -- cgit v1.2.3 From c499c717ee7cc07f47d7ee38a1791a58dcf1d4eb Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Wed, 10 Jan 2024 16:12:12 +0800 Subject: lib min_heap: optimize number of calls to min_heapify() Patch series "lib min_heap: Min heap optimizations". The purpose of this patch series is to enhance the existing min heap implementation. The optimization focuses on both the heap construction process and the number of comparisons made during the heapify operation. This patch (of 2): Improve the heap construction process by reducing unnecessary heapify operations. Specifically, adjust the starting condition from n / 2 to n / 2 - 1 in the loop that iterates over all non-leaf elements. 
Link: https://lkml.kernel.org/r/20240110081213.2289636-1-visitorckw@gmail.com Link: https://lkml.kernel.org/r/20240110081213.2289636-2-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index 44077837385f..18a581310eb3 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -70,7 +70,7 @@ void min_heapify_all(struct min_heap *heap, { int i; - for (i = heap->nr / 2; i >= 0; i--) + for (i = heap->nr / 2 - 1; i >= 0; i--) min_heapify(heap, i, func); } -- cgit v1.2.3 From c641722e0c944e423572dd6222c677678d793ed5 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Wed, 10 Jan 2024 16:12:13 +0800 Subject: lib min_heap: optimize number of comparisons in min_heapify() Optimize the min_heapify() function, resulting in a significant reduction of approximately 50% in the number of comparisons for large random inputs, while maintaining identical results. The current implementation performs two comparisons per level to identify the minimum among three elements. In contrast, the proposed bottom-up variation uses only one comparison per level to assess two children until reaching the leaves. Then, it sifts up until the correct position is determined. Typically, the process of sifting down proceeds to the leaf level, resulting in O(1) secondary comparisons instead of log2(n). This optimization significantly reduces the number of costly indirect function calls and improves overall performance. 
Link: https://lkml.kernel.org/r/20240110081213.2289636-3-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index 18a581310eb3..d52daf45861b 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -35,31 +35,33 @@ static __always_inline void min_heapify(struct min_heap *heap, int pos, const struct min_heap_callbacks *func) { - void *left, *right, *parent, *smallest; + void *left, *right; void *data = heap->data; + void *root = data + pos * func->elem_size; + int i = pos, j; + /* Find the sift-down path all the way to the leaves. */ for (;;) { - if (pos * 2 + 1 >= heap->nr) + if (i * 2 + 2 >= heap->nr) break; + left = data + (i * 2 + 1) * func->elem_size; + right = data + (i * 2 + 2) * func->elem_size; + i = func->less(left, right) ? i * 2 + 1 : i * 2 + 2; + } - left = data + ((pos * 2 + 1) * func->elem_size); - parent = data + (pos * func->elem_size); - smallest = parent; - if (func->less(left, smallest)) - smallest = left; - - if (pos * 2 + 2 < heap->nr) { - right = data + ((pos * 2 + 2) * func->elem_size); - if (func->less(right, smallest)) - smallest = right; - } - if (smallest == parent) - break; - func->swp(smallest, parent); - if (smallest == left) - pos = (pos * 2) + 1; - else - pos = (pos * 2) + 2; + /* Special case for the last leaf with no sibling. */ + if (i * 2 + 2 == heap->nr) + i = i * 2 + 1; + + /* Backtrack to the correct location. */ + while (i != pos && func->less(root, data + i * func->elem_size)) + i = (i - 1) / 2; + + /* Shift the element into its correct place. 
*/ + j = i; + while (i != pos) { + i = (i - 1) / 2; + func->swp(data + i * func->elem_size, data + j * func->elem_size); } } -- cgit v1.2.3 From d6bbab8f352efb0533d3fa4af09bb60da770ecc5 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 19 Jan 2024 04:13:21 +0800 Subject: flex_proportions: remove unused fprop_local_single The single variant of flex_proportions is not used. Simply remove it. Link: https://lkml.kernel.org/r/20240118201321.759174-1-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- include/linux/flex_proportions.h | 32 -------------------------------- 1 file changed, 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/flex_proportions.h b/include/linux/flex_proportions.h index 3e378b1fb0bc..e9a72fd0bfe7 100644 --- a/include/linux/flex_proportions.h +++ b/include/linux/flex_proportions.h @@ -38,38 +38,6 @@ int fprop_global_init(struct fprop_global *p, gfp_t gfp); void fprop_global_destroy(struct fprop_global *p); bool fprop_new_period(struct fprop_global *p, int periods); -/* - * ---- SINGLE ---- - */ -struct fprop_local_single { - /* the local events counter */ - unsigned long events; - /* Period in which we last updated events */ - unsigned int period; - raw_spinlock_t lock; /* Protect period and numerator */ -}; - -#define INIT_FPROP_LOCAL_SINGLE(name) \ -{ .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ -} - -int fprop_local_init_single(struct fprop_local_single *pl); -void fprop_local_destroy_single(struct fprop_local_single *pl); -void __fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl); -void fprop_fraction_single(struct fprop_global *p, - struct fprop_local_single *pl, unsigned long *numerator, - unsigned long *denominator); - -static inline -void fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl) -{ - unsigned long flags; - - local_irq_save(flags); - __fprop_inc_single(p, pl); - local_irq_restore(flags); -} - /* * ---- PERCPU 
---- */ -- cgit v1.2.3 From e5efd80a9a7688a26ef34b9c1d3801c25abdf350 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 25 Jan 2024 15:55:17 -0700 Subject: compiler-clang.h: update __diag_clang() macros for minimum version bump The minimum supported version of LLVM for building the kernel has been bumped to 13.0.1. Update the __diag_clang() macros for this bump. Link: https://lkml.kernel.org/r/20240125-bump-min-llvm-ver-to-13-0-1-v1-11-f5ff9bda41c5@kernel.org Signed-off-by: Nathan Chancellor Reviewed-by: Kees Cook Cc: Albert Ou Cc: "Aneesh Kumar K.V (IBM)" Cc: Ard Biesheuvel Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Conor Dooley Cc: Dave Hansen Cc: Ingo Molnar Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Nicolas Schier Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/compiler-clang.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index f0a47afef125..49feac0162a5 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -114,11 +114,7 @@ #define __diag_str(s) __diag_str1(s) #define __diag(s) _Pragma(__diag_str(clang diagnostic s)) -#if CONFIG_CLANG_VERSION >= 110000 -#define __diag_clang_11(s) __diag(s) -#else -#define __diag_clang_11(s) -#endif +#define __diag_clang_13(s) __diag(s) #define __diag_ignore_all(option, comment) \ - __diag_clang(11, ignore, option) + __diag_clang(13, ignore, option) -- cgit v1.2.3 From ac4db926e17a669c788efc89b81a4a0f40648445 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 24 Jan 2024 15:27:35 +0100 Subject: init: remove obsolete arch_call_rest_init() wrapper Since commit 3570ee046c46b5dc ("s390/smp: keep the original lowcore for CPU 0"), there is no longer any architecture that needs to override arch_call_rest_init(). 
Remove the weak wrapper around rest_init(), call rest_init() directly, and make rest_init() static. Link: https://lkml.kernel.org/r/aa10868bfb176eef4abb8bb4a710b85330792694.1706106183.git.geert@linux-m68k.org Signed-off-by: Geert Uytterhoeven Cc: Arnd Bergmann Cc: Ilya Leoshkevich Cc: Josh Poimboeuf Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/start_kernel.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/start_kernel.h b/include/linux/start_kernel.h index a9806a44a605..09f994ac87df 100644 --- a/include/linux/start_kernel.h +++ b/include/linux/start_kernel.h @@ -9,7 +9,5 @@ up something else. */ extern asmlinkage void __init __noreturn start_kernel(void); -extern void __init __noreturn arch_call_rest_init(void); -extern void __ref __noreturn rest_init(void); #endif /* _LINUX_START_KERNEL_H */ -- cgit v1.2.3 From 022b973a3de9b45148e4ae2da1480ec9e61e74a2 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 5 Feb 2024 12:39:30 +0300 Subject: smp: make __smp_processor_id() 0-argument macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit smp_processor_id family of macros never accepted any arguments. #define __smp_processor_id(x) works by accident (see C99 6.10.3 §4). __smp_processor_id() gets 1 (empty) argument and passes it down to raw_smp_processor_id() which doesn't accept arguments. Link: https://lkml.kernel.org/r/0037d1f2-8153-4b33-b43e-f4b6ecd710ac@p183 Signed-off-by: Alexey Dobriyan Cc: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton --- include/linux/smp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index e87520dc2959..cc517002c599 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -261,7 +261,7 @@ static inline int get_boot_cpu_id(void) * regular asm read for the stable. 
*/ #ifndef __smp_processor_id -#define __smp_processor_id(x) raw_smp_processor_id(x) +#define __smp_processor_id() raw_smp_processor_id() #endif #ifdef CONFIG_DEBUG_PREEMPT -- cgit v1.2.3 From 2932fb0a927d30690b8cb70c71d511fd9054bb61 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 8 Feb 2024 02:14:23 +0000 Subject: list: leverage list_is_head() for list_entry_is_head() This is what list_is_head() exactly do. Link: https://lkml.kernel.org/r/20240208021423.15704-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Andy Shevchenko Signed-off-by: Andrew Morton --- include/linux/list.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index 523b7c4d000a..5f4b0a39cf46 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -766,7 +766,7 @@ static inline size_t list_count_nodes(struct list_head *head) * @member: the name of the list_head within the struct. */ #define list_entry_is_head(pos, head, member) \ - (&pos->member == (head)) + list_is_head(&pos->member, (head)) /** * list_for_each_entry - iterate over list of given type -- cgit v1.2.3 From 77bcd9e6231a5297ef417a7d7f734d61c2bcceb6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 9 Jan 2024 16:39:35 -0800 Subject: KVM: Add dedicated arch hook for querying if vCPU was preempted in-kernel Plumb in a dedicated hook for querying whether or not a vCPU was preempted in-kernel. Unlike literally every other architecture, x86's VMX can check if a vCPU is in kernel context if and only if the vCPU is loaded on the current pCPU. x86's kvm_arch_vcpu_in_kernel() works around the limitation by querying kvm_get_running_vcpu() and redirecting to vcpu->arch.preempted_in_kernel as needed. But that's unnecessary, confusing, and fragile, e.g. x86 has had at least one bug where KVM incorrectly used a stale preempted_in_kernel. No functional change intended. 
Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20240110003938.490206-2-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7e7fd25b09b3..28b020404a41 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1505,6 +1505,7 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu); +bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu); int kvm_arch_post_init_vm(struct kvm *kvm); void kvm_arch_pre_destroy_vm(struct kvm *kvm); int kvm_arch_create_vm_debugfs(struct kvm *kvm); -- cgit v1.2.3 From aa3c88990f77bb9acb3d445337bc088031ac63f9 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 22 Feb 2024 12:41:00 -0800 Subject: sysfs: Document new "group visible" helpers Add documentation and examples for how to use DEFINE_SYSFS_GROUP_VISIBLE() and SYSFS_GROUP_VISIBLE(). Recall that the motivation for this work is that it is easier to reason about the lifetime of statically defined sysfs attributes that become visible at device_add() time rather than dynamically adding them later. DEFINE_SYSFS_GROUP_VISIBLE() tackles one of the reasons to opt for dynamically created attributes which did not have a facility for hiding empty directories. 
Signed-off-by: Dan Williams Link: https://lore.kernel.org/r/170863446065.1479840.10697164014098377292.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index a42642b277dd..dabf7f4f3581 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -105,8 +105,42 @@ struct attribute_group { #define SYSFS_GROUP_INVISIBLE 020000 /* - * The first call to is_visible() in the create / update path may - * indicate visibility for the entire group + * DEFINE_SYSFS_GROUP_VISIBLE(name): + * A helper macro to pair with the assignment of ".is_visible = + * SYSFS_GROUP_VISIBLE(name)", that arranges for the directory + * associated with a named attribute_group to optionally be hidden. + * This allows for static declaration of attribute_groups, and the + * simplification of attribute visibility lifetime that implies, + * without polluting sysfs with empty attribute directories. + * Ex. + * + * static umode_t example_attr_visible(struct kobject *kobj, + * struct attribute *attr, int n) + * { + * if (example_attr_condition) + * return 0; + * else if (ro_attr_condition) + * return 0444; + * return a->mode; + * } + * + * static bool example_group_visible(struct kobject *kobj) + * { + * if (example_group_condition) + * return false; + * return true; + * } + * + * DEFINE_SYSFS_GROUP_VISIBLE(example); + * + * static struct attribute_group example_group = { + * .name = "example", + * .is_visible = SYSFS_GROUP_VISIBLE(example), + * .attrs = &example_attrs, + * }; + * + * Note that it expects <name>_attr_visible and <name>_group_visible to + * be defined.
*/ #define DEFINE_SYSFS_GROUP_VISIBLE(name) \ static inline umode_t sysfs_group_visible_##name( \ @@ -119,7 +153,9 @@ struct attribute_group { /* * Same as DEFINE_SYSFS_GROUP_VISIBLE, but for groups with only binary - * attributes + * attributes. If an attribute_group defines both text and binary + * attributes, the group visibility is determined by the function + * specified to is_visible() not is_bin_visible() */ #define DEFINE_SYSFS_BIN_GROUP_VISIBLE(name) \ static inline umode_t sysfs_group_visible_##name( \ -- cgit v1.2.3 From 04edfa7fa059ba50d3236b55ba0ae23b1721e868 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 22 Feb 2024 12:41:06 -0800 Subject: sysfs: Introduce DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE() One of the first users of DEFINE_SYSFS_GROUP_VISIBLE() did this: static umode_t dp0_attr_visible(struct kobject *kobj, struct attribute *attr, int n) { struct sdw_slave *slave = dev_to_sdw_dev(kobj_to_dev(kobj)); if (slave->prop.dp0_prop) return attr->mode; return 0; } static bool dp0_group_visible(struct kobject *kobj) { struct sdw_slave *slave = dev_to_sdw_dev(kobj_to_dev(kobj)); if (slave->prop.dp0_prop) return true; return false; } DEFINE_SYSFS_GROUP_VISIBLE(dp0); ...i.e. the _group_visible() helper is identical to the _attr_visible() helper. Use the "simple" helper to reduce that to: static bool dp0_group_visible(struct kobject *kobj) { struct sdw_slave *slave = dev_to_sdw_dev(kobj_to_dev(kobj)); if (slave->prop.dp0_prop) return true; return false; } DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(dp0); Remove the need to specify per attribute visibility if the goal is to hide the entire group. 
Signed-off-by: Dan Williams Link: https://lore.kernel.org/r/170863446625.1479840.10593839479268727913.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index dabf7f4f3581..326341c62385 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -140,7 +140,9 @@ struct attribute_group { * }; * * Note that it expects <name>_attr_visible and <name>_group_visible to - * be defined. + * be defined. For cases where individual attributes do not need + * separate visibility consideration, only entire group visibility at + * once, see DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(). */ #define DEFINE_SYSFS_GROUP_VISIBLE(name) \ static inline umode_t sysfs_group_visible_##name( \ @@ -151,6 +153,38 @@ struct attribute_group { return name##_attr_visible(kobj, attr, n); \ } +/* + * DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(name): + * A helper macro to pair with SYSFS_GROUP_VISIBLE() that like + * DEFINE_SYSFS_GROUP_VISIBLE() controls group visibility, but does + * not require the implementation of a per-attribute visibility + * callback. + * Ex. + * + * static bool example_group_visible(struct kobject *kobj) + * { + * if (example_group_condition) + * return false; + * return true; + * } + * + * DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(example); + * + * static struct attribute_group example_group = { + * .name = "example", + * .is_visible = SYSFS_GROUP_VISIBLE(example), + * .attrs = &example_attrs, + * }; + */ +#define DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(name) \ + static inline umode_t sysfs_group_visible_##name( \ + struct kobject *kobj, struct attribute *a, int n) \ + { \ + if (n == 0 && !name##_group_visible(kobj)) \ + return SYSFS_GROUP_INVISIBLE; \ + return a->mode; \ + } + /* * Same as DEFINE_SYSFS_GROUP_VISIBLE, but for groups with only binary * attributes.
If an attribute_group defines both text and binary @@ -166,6 +200,15 @@ struct attribute_group { return name##_attr_visible(kobj, attr, n); \ } +#define DEFINE_SIMPLE_SYSFS_BIN_GROUP_VISIBLE(name) \ + static inline umode_t sysfs_group_visible_##name( \ + struct kobject *kobj, struct bin_attribute *a, int n) \ + { \ + if (n == 0 && !name##_group_visible(kobj)) \ + return SYSFS_GROUP_INVISIBLE; \ + return a->mode; \ + } + #define SYSFS_GROUP_VISIBLE(fn) sysfs_group_visible_##fn /* -- cgit v1.2.3 From f4cc33e78ba8624a79ba8dea98ce5c85aa9ca33c Mon Sep 17 00:00:00 2001 From: Yu Chien Peter Lin Date: Thu, 22 Feb 2024 16:39:39 +0800 Subject: irqchip/riscv-intc: Introduce Andes hart-level interrupt controller Add support for the Andes hart-level interrupt controller. This controller provides interrupt mask/unmask functions to access the custom register (SLIE) where the non-standard S-mode local interrupt enable bits are located. The base of custom interrupt number is set to 256. To share the riscv_intc_domain_map() with the generic RISC-V INTC and ACPI, add a chip parameter to riscv_intc_init_common(), so it can be passed to the irq_domain_set_info() as a private data. Andes hart-level interrupt controller requires the "andestech,cpu-intc" compatible string to be present in interrupt-controller of cpu node to enable the use of custom local interrupt source. e.g., cpu0: cpu@0 { compatible = "andestech,ax45mp", "riscv"; ... 
cpu0-intc: interrupt-controller { #interrupt-cells = <0x01>; compatible = "andestech,cpu-intc", "riscv,cpu-intc"; interrupt-controller; }; }; Signed-off-by: Yu Chien Peter Lin Signed-off-by: Thomas Gleixner Reviewed-by: Randolph Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20240222083946.3977135-4-peterlin@andestech.com --- include/linux/soc/andes/irq.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 include/linux/soc/andes/irq.h (limited to 'include/linux') diff --git a/include/linux/soc/andes/irq.h b/include/linux/soc/andes/irq.h new file mode 100644 index 000000000000..edc3182d6e66 --- /dev/null +++ b/include/linux/soc/andes/irq.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2023 Andes Technology Corporation + */ +#ifndef __ANDES_IRQ_H +#define __ANDES_IRQ_H + +/* Andes PMU irq number */ +#define ANDES_RV_IRQ_PMOVI 18 +#define ANDES_RV_IRQ_LAST ANDES_RV_IRQ_PMOVI +#define ANDES_SLI_CAUSE_BASE 256 + +/* Andes PMU related registers */ +#define ANDES_CSR_SLIE 0x9c4 +#define ANDES_CSR_SLIP 0x9c5 +#define ANDES_CSR_SCOUNTEROF 0x9d4 + +#endif /* __ANDES_IRQ_H */ -- cgit v1.2.3 From 00ca8a15dafa990d391abc37f2b8256ddf909b35 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 17 Feb 2024 10:39:37 +0100 Subject: phy: constify of_phandle_args in xlate The xlate callbacks are supposed to translate of_phandle_args to proper provider without modifying the of_phandle_args. Make the argument pointer to const for code safety and readability. 
Signed-off-by: Krzysztof Kozlowski Acked-by: Thierry Reding Acked-by: Linus Walleij Acked-by: Florian Fainelli #Broadcom Link: https://lore.kernel.org/r/20240217093937.58234-1-krzysztof.kozlowski@linaro.org Signed-off-by: Vinod Koul --- include/linux/phy/phy.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h index f6d607ef0e80..58be86e6fe83 100644 --- a/include/linux/phy/phy.h +++ b/include/linux/phy/phy.h @@ -176,7 +176,7 @@ struct phy_provider { struct module *owner; struct list_head list; struct phy * (*of_xlate)(struct device *dev, - struct of_phandle_args *args); + const struct of_phandle_args *args); }; /** @@ -265,7 +265,7 @@ void phy_put(struct device *dev, struct phy *phy); void devm_phy_put(struct device *dev, struct phy *phy); struct phy *of_phy_get(struct device_node *np, const char *con_id); struct phy *of_phy_simple_xlate(struct device *dev, - struct of_phandle_args *args); + const struct of_phandle_args *args); struct phy *phy_create(struct device *dev, struct device_node *node, const struct phy_ops *ops); struct phy *devm_phy_create(struct device *dev, struct device_node *node, @@ -275,11 +275,11 @@ void devm_phy_destroy(struct device *dev, struct phy *phy); struct phy_provider *__of_phy_provider_register(struct device *dev, struct device_node *children, struct module *owner, struct phy * (*of_xlate)(struct device *dev, - struct of_phandle_args *args)); + const struct of_phandle_args *args)); struct phy_provider *__devm_of_phy_provider_register(struct device *dev, struct device_node *children, struct module *owner, struct phy * (*of_xlate)(struct device *dev, - struct of_phandle_args *args)); + const struct of_phandle_args *args)); void of_phy_provider_unregister(struct phy_provider *phy_provider); void devm_of_phy_provider_unregister(struct device *dev, struct phy_provider *phy_provider); @@ -479,7 +479,7 @@ static inline struct phy 
*of_phy_get(struct device_node *np, const char *con_id) } static inline struct phy *of_phy_simple_xlate(struct device *dev, - struct of_phandle_args *args) + const struct of_phandle_args *args) { return ERR_PTR(-ENOSYS); } @@ -509,7 +509,7 @@ static inline void devm_phy_destroy(struct device *dev, struct phy *phy) static inline struct phy_provider *__of_phy_provider_register( struct device *dev, struct device_node *children, struct module *owner, struct phy * (*of_xlate)(struct device *dev, - struct of_phandle_args *args)) + const struct of_phandle_args *args)) { return ERR_PTR(-ENOSYS); } @@ -517,7 +517,7 @@ static inline struct phy_provider *__of_phy_provider_register( static inline struct phy_provider *__devm_of_phy_provider_register(struct device *dev, struct device_node *children, struct module *owner, struct phy * (*of_xlate)(struct device *dev, - struct of_phandle_args *args)) + const struct of_phandle_args *args)) { return ERR_PTR(-ENOSYS); } -- cgit v1.2.3 From dfebe38e46c2e866c6f3ff54d8ed8dabbed193de Mon Sep 17 00:00:00 2001 From: Fuyao Kashizuku Date: Wed, 27 Dec 2023 10:01:17 +0800 Subject: mfd: sun4i-gpadc: Correct specified GPADC interrupt numbers The identifiers are used as IRQ resource numbers, where 0 is treated specially. This fixes sun4i-gpadc-iio probe failed when request irq. 
The backtrace: WARNING: CPU: 3 PID: 1 at drivers/base/platform.c:451 __platform_get_irq_byname+0xb8/0xc4 0 is an invalid IRQ number Modules linked in: CPU: 3 PID: 1 Comm: swapper/0 Not tainted 6.7.0-rc6 #9 Hardware name: Allwinner sun8i Family unwind_backtrace show_stack dump_stack_lvl __warn warn_slowpath_fmt __platform_get_irq_byname platform_get_irq_byname sun4i_irq_init sun4i_gpadc_probe platform_probe really_probe __driver_probe_device driver_probe_device __driver_attach bus_for_each_dev bus_add_driver driver_register do_one_initcall do_initcall_level do_initcalls kernel_init_freeable kernel_init Log reports: sun4i-gpadc-iio sun6i-a31-gpadc-iio.0: error -EINVAL: IRQ FIFO_DATA_PENDING not found sun4i-gpadc-iio: probe of sun6i-a31-gpadc-iio.0 failed with error -22 Signed-off-by: Fuyao Kashizuku Acked-by: Jernej Skrabec Link: https://lore.kernel.org/r/ZYuFbUUus9apiCpq@debian.cyg Signed-off-by: Lee Jones --- include/linux/mfd/sun4i-gpadc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/sun4i-gpadc.h b/include/linux/mfd/sun4i-gpadc.h index ea0ccf33a459..021f820f9d52 100644 --- a/include/linux/mfd/sun4i-gpadc.h +++ b/include/linux/mfd/sun4i-gpadc.h @@ -81,8 +81,8 @@ #define SUN4I_GPADC_TEMP_DATA 0x20 #define SUN4I_GPADC_DATA 0x24 -#define SUN4I_GPADC_IRQ_FIFO_DATA 0 -#define SUN4I_GPADC_IRQ_TEMP_DATA 1 +#define SUN4I_GPADC_IRQ_FIFO_DATA 1 +#define SUN4I_GPADC_IRQ_TEMP_DATA 2 /* 10s delay before suspending the IP */ #define SUN4I_GPADC_AUTOSUSPEND_DELAY 10000 -- cgit v1.2.3 From 8b9a1f5ef43b8d26c4df3b4e3cbebec04d7be1c5 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 29 Jan 2024 15:25:53 +0000 Subject: mfd: cs42l43: Tidy up header includes Use more forward declarations, move header guards to cover other includes, and rely less on including headers through other headers.
Suggested-by: Andy Shevchenko Signed-off-by: Charles Keepax Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240129152557.3221212-2-ckeepax@opensource.cirrus.com Signed-off-by: Lee Jones --- include/linux/mfd/cs42l43.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/cs42l43.h b/include/linux/mfd/cs42l43.h index cf8263aab41b..2239d8585e78 100644 --- a/include/linux/mfd/cs42l43.h +++ b/include/linux/mfd/cs42l43.h @@ -6,20 +6,21 @@ * Cirrus Logic International Semiconductor Ltd. */ +#ifndef CS42L43_CORE_EXT_H +#define CS42L43_CORE_EXT_H + #include -#include -#include #include #include #include -#include #include -#ifndef CS42L43_CORE_EXT_H -#define CS42L43_CORE_EXT_H - #define CS42L43_N_SUPPLIES 3 +struct device; +struct gpio_desc; +struct sdw_slave; + enum cs42l43_irq_numbers { CS42L43_PLL_LOST_LOCK, CS42L43_PLL_READY, -- cgit v1.2.3 From 65d4418c5002ec5b0e529455bf4152fd43459079 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 22 Feb 2024 10:07:41 -0400 Subject: iommu/sva: Restore SVA handle sharing Prior to commit 092edaddb660 ("iommu: Support mm PASID 1:n with sva domains") the code allowed a SVA handle to be bound multiple times to the same (mm, device) pair. This was alluded to in the kdoc comment, but we had understood this to be more a remark about allowing multiple devices, not a literal same-driver re-opening the same SVA. It turns out uacce and idxd were both relying on the core code to handle reference counting for same-device same-mm scenarios. As this looks hard to resolve in the drivers bring it back to the core code. The new design has changed the meaning of the domain->users refcount to refer to the number of devices that are sharing that domain for the same mm. This is part of the design to lift the SVA domain de-duplication out of the drivers. Return the old behavior by explicitly de-duplicating the struct iommu_sva handle. 
The same (mm, device) will return the same handle pointer and the core code will handle tracking this. The last unbind of the handle will destroy it. Fixes: 092edaddb660 ("iommu: Support mm PASID 1:n with sva domains") Reported-by: Zhangfei Gao Closes: https://lore.kernel.org/all/20240221110658.529-1-zhangfei.gao@linaro.org/ Tested-by: Zhangfei Gao Signed-off-by: Jason Gunthorpe Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/0-v1-9455fc497a6f+3b4-iommu_sva_sharing_jgg@nvidia.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 1ea2a820e1eb..5e27cb3a3be9 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -892,11 +892,14 @@ struct iommu_fwspec { struct iommu_sva { struct device *dev; struct iommu_domain *domain; + struct list_head handle_item; + refcount_t users; }; struct iommu_mm_data { u32 pasid; struct list_head sva_domains; + struct list_head sva_handles; }; int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, -- cgit v1.2.3 From 0db017f8edd9b9af818bc1d68ba578df1b4c4628 Mon Sep 17 00:00:00 2001 From: Peter Griffin Date: Tue, 20 Feb 2024 11:50:11 +0000 Subject: mfd: syscon: Remove extern from function prototypes The kernel coding style does not require 'extern' in function prototypes in .h files, so remove them as they are not needed. To avoid checkpatch warnings such as CHECK: Lines should not end with a '(' +struct regmap *syscon_regmap_lookup_by_phandle( The indentation is also updated. No functional changes in this patch. 
Signed-off-by: Peter Griffin Link: https://lore.kernel.org/r/20240220115012.471689-3-peter.griffin@linaro.org Signed-off-by: Lee Jones --- include/linux/mfd/syscon.h | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/syscon.h b/include/linux/mfd/syscon.h index fecc2fa2a364..c315903f6dab 100644 --- a/include/linux/mfd/syscon.h +++ b/include/linux/mfd/syscon.h @@ -17,20 +17,17 @@ struct device_node; #ifdef CONFIG_MFD_SYSCON -extern struct regmap *device_node_to_regmap(struct device_node *np); -extern struct regmap *syscon_node_to_regmap(struct device_node *np); -extern struct regmap *syscon_regmap_lookup_by_compatible(const char *s); -extern struct regmap *syscon_regmap_lookup_by_phandle( - struct device_node *np, - const char *property); -extern struct regmap *syscon_regmap_lookup_by_phandle_args( - struct device_node *np, - const char *property, - int arg_count, - unsigned int *out_args); -extern struct regmap *syscon_regmap_lookup_by_phandle_optional( - struct device_node *np, - const char *property); +struct regmap *device_node_to_regmap(struct device_node *np); +struct regmap *syscon_node_to_regmap(struct device_node *np); +struct regmap *syscon_regmap_lookup_by_compatible(const char *s); +struct regmap *syscon_regmap_lookup_by_phandle(struct device_node *np, + const char *property); +struct regmap *syscon_regmap_lookup_by_phandle_args(struct device_node *np, + const char *property, + int arg_count, + unsigned int *out_args); +struct regmap *syscon_regmap_lookup_by_phandle_optional(struct device_node *np, + const char *property); #else static inline struct regmap *device_node_to_regmap(struct device_node *np) { -- cgit v1.2.3 From ca9414a1d08756c8392f9219caee607e1b7bade1 Mon Sep 17 00:00:00 2001 From: Andreas Kemnade Date: Sat, 17 Feb 2024 09:20:04 +0100 Subject: mfd: twl-core: Add power off implementation for twl603x If the system-power-controller property is there, enable 
power off. Implementation is based on a Linux v3.0 vendor kernel. Signed-off-by: Andreas Kemnade Link: https://lore.kernel.org/r/20240217082007.3238948-3-andreas@kemnade.info Signed-off-by: Lee Jones --- include/linux/mfd/twl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mfd/twl.h b/include/linux/mfd/twl.h index c062d91a67d9..85dc406173db 100644 --- a/include/linux/mfd/twl.h +++ b/include/linux/mfd/twl.h @@ -461,6 +461,7 @@ static inline int twl6030_mmc_card_detect(struct device *dev, int slot) #define TWL4030_PM_MASTER_GLOBAL_TST 0xb6 +#define TWL6030_PHOENIX_DEV_ON 0x06 /*----------------------------------------------------------------------*/ /* Power bus message definitions */ -- cgit v1.2.3 From 9b0a62758665e4d76269bba61eb63e5b8d18e499 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 22 Feb 2024 14:52:01 +0100 Subject: thermal: core: Store zone trips table in struct thermal_zone_device The current code expects thermal zone creators to pass a pointer to a writable trips table to thermal_zone_device_register_with_trips() and that trips table is then used by the thermal core going forward. Consequently, the callers of thermal_zone_device_register_with_trips() are required to hold on to the trips table passed to it until the given thermal zone is unregistered, at which point the trips table can be freed, but at the same time they are not expected to access that table directly. This is both error prone and confusing. To address it, turn the trips table pointer in struct thermal_zone_device into a flex array (counted by its num_trips field), allocate it during thermal zone device allocation and copy the contents of the trips table supplied by the zone creator (which can be const now) into it, which will allow the callers of thermal_zone_device_register_with_trips() to drop their trip tables right after the zone registration. 
This requires the imx thermal driver to be adjusted to store the new temperature in its internal trips table in imx_set_trip_temp(), because it will be separate from the core's trips table now and it has to be explicitly kept in sync with the latter. Signed-off-by: Rafael J. Wysocki Reviewed-by: Stanislaw Gruszka Reviewed-by: Daniel Lezcano --- include/linux/thermal.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 65d8f92a9a0d..572a24f29a10 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -130,7 +130,6 @@ struct thermal_cooling_device { * @trip_hyst_attrs: attributes for trip points for sysfs: trip hysteresis * @mode: current mode of this thermal zone * @devdata: private pointer for device private data - * @trips: an array of struct thermal_trip * @num_trips: number of trip points the thermal zone supports * @passive_delay_jiffies: number of jiffies to wait between polls when * performing passive cooling. 
@@ -160,6 +159,7 @@ struct thermal_cooling_device { * @poll_queue: delayed work for polling * @notify_event: Last notification event * @suspended: thermal zone suspend indicator + * @trips: array of struct thermal_trip objects */ struct thermal_zone_device { int id; @@ -172,7 +172,6 @@ struct thermal_zone_device { struct thermal_attr *trip_hyst_attrs; enum thermal_device_mode mode; void *devdata; - struct thermal_trip *trips; int num_trips; unsigned long passive_delay_jiffies; unsigned long polling_delay_jiffies; @@ -193,10 +192,11 @@ struct thermal_zone_device { struct list_head node; struct delayed_work poll_queue; enum thermal_notify_event notify_event; + bool suspended; #ifdef CONFIG_THERMAL_DEBUGFS struct thermal_debugfs *debugfs; #endif - bool suspended; + struct thermal_trip trips[] __counted_by(num_trips); }; /** @@ -315,7 +315,7 @@ int thermal_zone_get_crit_temp(struct thermal_zone_device *tz, int *temp); #ifdef CONFIG_THERMAL struct thermal_zone_device *thermal_zone_device_register_with_trips( const char *type, - struct thermal_trip *trips, + const struct thermal_trip *trips, int num_trips, int mask, void *devdata, struct thermal_zone_device_ops *ops, @@ -375,7 +375,7 @@ void thermal_zone_device_critical(struct thermal_zone_device *tz); #else static inline struct thermal_zone_device *thermal_zone_device_register_with_trips( const char *type, - struct thermal_trip *trips, + const struct thermal_trip *trips, int num_trips, int mask, void *devdata, struct thermal_zone_device_ops *ops, -- cgit v1.2.3 From 698a1eb1f75eb6ac957d2af7721a3b1a94281e5d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 22 Feb 2024 18:18:01 +0100 Subject: thermal: core: Store zone ops in struct thermal_zone_device The current code requires thermal zone creators to pass pointers to writable ops structures to thermal_zone_device_register_with_trips() which needs to modify the target struct thermal_zone_device_ops object if the "critical" operation in it is NULL. 
Moreover, the callers of thermal_zone_device_register_with_trips() are required to hold on to the struct thermal_zone_device_ops object passed to it until the given thermal zone is unregistered. Both of these requirements are quite inconvenient, so modify struct thermal_zone_device to contain struct thermal_zone_device_ops as field and make thermal_zone_device_register_with_trips() copy the contents of the struct thermal_zone_device_ops passed to it via a pointer (which can be const now) to that field. Also adjust the code using thermal zone ops accordingly and modify thermal_of_zone_register() to use a local ops variable during thermal zone registration so ops do not need to be freed in thermal_of_zone_unregister() any more. Signed-off-by: Rafael J. Wysocki Reviewed-by: Stanislaw Gruszka Reviewed-by: Daniel Lezcano --- include/linux/thermal.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 572a24f29a10..ec0559e98d6f 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -182,7 +182,7 @@ struct thermal_zone_device { int prev_low_trip; int prev_high_trip; atomic_t need_update; - struct thermal_zone_device_ops *ops; + struct thermal_zone_device_ops ops; struct thermal_zone_params *tzp; struct thermal_governor *governor; void *governor_data; @@ -318,14 +318,14 @@ struct thermal_zone_device *thermal_zone_device_register_with_trips( const struct thermal_trip *trips, int num_trips, int mask, void *devdata, - struct thermal_zone_device_ops *ops, + const struct thermal_zone_device_ops *ops, const struct thermal_zone_params *tzp, int passive_delay, int polling_delay); struct thermal_zone_device *thermal_tripless_zone_device_register( const char *type, void *devdata, - struct thermal_zone_device_ops *ops, + const struct thermal_zone_device_ops *ops, const struct thermal_zone_params *tzp); void thermal_zone_device_unregister(struct thermal_zone_device 
*tz); @@ -378,7 +378,7 @@ static inline struct thermal_zone_device *thermal_zone_device_register_with_trip const struct thermal_trip *trips, int num_trips, int mask, void *devdata, - struct thermal_zone_device_ops *ops, + const struct thermal_zone_device_ops *ops, const struct thermal_zone_params *tzp, int passive_delay, int polling_delay) { return ERR_PTR(-ENODEV); } -- cgit v1.2.3 From d02c357e5bfa7dfd618b7b3015624beb71f58f1f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 21 Feb 2024 17:26:40 -0800 Subject: KVM: x86/mmu: Retry fault before acquiring mmu_lock if mapping is changing Retry page faults without acquiring mmu_lock, and without even faulting the page into the primary MMU, if the resolved gfn is covered by an active invalidation. Contending for mmu_lock is especially problematic on preemptible kernels as the mmu_notifier invalidation task will yield mmu_lock (see rwlock_needbreak()), delay the in-progress invalidation, and ultimately increase the latency of resolving the page fault. And in the worst case scenario, yielding will be accompanied by a remote TLB flush, e.g. if the invalidation covers a large range of memory and vCPUs are accessing addresses that were already zapped. Faulting the page into the primary MMU is similarly problematic, as doing so may acquire locks that need to be taken for the invalidation to complete (the primary MMU has finer grained locks than KVM's MMU), and/or may cause unnecessary churn (getting/putting pages, marking them accessed, etc). Alternatively, the yielding issue could be mitigated by teaching KVM's MMU iterators to perform more work before yielding, but that wouldn't solve the lock contention and would negatively affect scenarios where a vCPU is trying to fault in an address that is NOT covered by the in-progress invalidation. 
Add a dedicated lockless version of the range-based retry check to avoid false positives on the sanity check on start+end WARN, and so that it's super obvious that checking for a racing invalidation without holding mmu_lock is unsafe (though obviously useful). Wrap mmu_invalidate_in_progress in READ_ONCE() to ensure that pre-checking invalidation in a loop won't put KVM into an infinite loop, e.g. due to caching the in-progress flag and never seeing it go to '0'. Force a load of mmu_invalidate_seq as well, even though it isn't strictly necessary to avoid an infinite loop, as doing so improves the probability that KVM will detect an invalidation that already completed before acquiring mmu_lock and bailing anyways. Do the pre-check even for non-preemptible kernels, as waiting to detect the invalidation until mmu_lock is held guarantees the vCPU will observe the worst case latency in terms of handling the fault, and can generate even more mmu_lock contention. E.g. the vCPU will acquire mmu_lock, detect retry, drop mmu_lock, re-enter the guest, retake the fault, and eventually re-acquire mmu_lock. This behavior is also why there are no new starvation issues due to losing the fairness guarantees provided by rwlocks: if the vCPU needs to retry, it _must_ drop mmu_lock, i.e. waiting on mmu_lock doesn't guarantee forward progress in the face of _another_ mmu_notifier invalidation event. Note, adding READ_ONCE() isn't entirely free, e.g. on x86, the READ_ONCE() may generate a load into a register instead of doing a direct comparison (MOV+TEST+Jcc instead of CMP+Jcc), but practically speaking the added cost is a few bytes of code and maaaaybe a cycle or three.
Reported-by: Yan Zhao Closes: https://lore.kernel.org/all/ZNnPF4W26ZbAyGto@yzhao56-desk.sh.intel.com Reported-by: Friedrich Weber Cc: Kai Huang Cc: Yan Zhao Cc: Yuan Yao Cc: Xu Yilun Acked-by: Kai Huang Reviewed-by: Yan Zhao Link: https://lore.kernel.org/r/20240222012640.2820927-1-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7e7fd25b09b3..179df96b20f8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2031,6 +2031,32 @@ static inline int mmu_invalidate_retry_gfn(struct kvm *kvm, return 1; return 0; } + +/* + * This lockless version of the range-based retry check *must* be paired with a + * call to the locked version after acquiring mmu_lock, i.e. this is safe to + * use only as a pre-check to avoid contending mmu_lock. This version *will* + * get false negatives and false positives. + */ +static inline bool mmu_invalidate_retry_gfn_unsafe(struct kvm *kvm, + unsigned long mmu_seq, + gfn_t gfn) +{ + /* + * Use READ_ONCE() to ensure the in-progress flag and sequence counter + * are always read from memory, e.g. so that checking for retry in a + * loop won't result in an infinite retry loop. Don't force loads for + * start+end, as the key to avoiding infinite retry loops is observing + * the 1=>0 transition of in-progress, i.e. getting false negatives + * due to stale start+end values is acceptable. 
+ */ + if (unlikely(READ_ONCE(kvm->mmu_invalidate_in_progress)) && + gfn >= kvm->mmu_invalidate_range_start && + gfn < kvm->mmu_invalidate_range_end) + return true; + + return READ_ONCE(kvm->mmu_invalidate_seq) != mmu_seq; +} #endif #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING -- cgit v1.2.3 From 284851ee5caef1b42b513752bf1642ce4570bdc1 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 16 Feb 2024 15:59:41 +0000 Subject: KVM: Get rid of return value from kvm_arch_create_vm_debugfs() The general expectation with debugfs is that any initialization failure is nonfatal. Nevertheless, kvm_arch_create_vm_debugfs() allows implementations to return an error and kvm_create_vm_debugfs() allows that to fail VM creation. Change to a void return to discourage architectures from making debugfs failures fatal for the VM. Seems like everyone already had the right idea, as all implementations already return 0 unconditionally. Acked-by: Marc Zyngier Acked-by: Paolo Bonzini Link: https://lore.kernel.org/r/20240216155941.2029458-1-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7e7fd25b09b3..9a45f673f687 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1507,7 +1507,7 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_post_init_vm(struct kvm *kvm); void kvm_arch_pre_destroy_vm(struct kvm *kvm); -int kvm_arch_create_vm_debugfs(struct kvm *kvm); +void kvm_arch_create_vm_debugfs(struct kvm *kvm); #ifndef __KVM_HAVE_ARCH_VM_ALLOC /* -- cgit v1.2.3 From 025f8ad20f2e3264d11683aa9cbbf0083eefbdcd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 22 Feb 2024 15:03:10 +0100 Subject: net: mpls: error out if inner headers are not set mpls_gso_segment() assumes skb_inner_network_header() returns a valid result: 
mpls_hlen = skb_inner_network_header(skb) - skb_network_header(skb); if (unlikely(!mpls_hlen || mpls_hlen % MPLS_HLEN)) goto out; if (unlikely(!pskb_may_pull(skb, mpls_hlen))) With syzbot reproducer, skb_inner_network_header() yields 0, skb_network_header() returns 108, so this will "pskb_may_pull(skb, -108)))" which triggers a newly added DEBUG_NET_WARN_ON_ONCE() check: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 5068 at include/linux/skbuff.h:2723 pskb_may_pull_reason include/linux/skbuff.h:2723 [inline] WARNING: CPU: 0 PID: 5068 at include/linux/skbuff.h:2723 pskb_may_pull include/linux/skbuff.h:2739 [inline] WARNING: CPU: 0 PID: 5068 at include/linux/skbuff.h:2723 mpls_gso_segment+0x773/0xaa0 net/mpls/mpls_gso.c:34 [..] skb_mac_gso_segment+0x383/0x740 net/core/gso.c:53 nsh_gso_segment+0x40a/0xad0 net/nsh/nsh.c:108 skb_mac_gso_segment+0x383/0x740 net/core/gso.c:53 __skb_gso_segment+0x324/0x4c0 net/core/gso.c:124 skb_gso_segment include/net/gso.h:83 [inline] [..] sch_direct_xmit+0x11a/0x5f0 net/sched/sch_generic.c:327 [..] packet_sendmsg+0x46a9/0x6130 net/packet/af_packet.c:3113 [..] First iteration of this patch made mpls_hlen signed and changed test to error out to "mpls_hlen <= 0 || ..". Eric Dumazet said: > I was thinking about adding a debug check in skb_inner_network_header() > if inner_network_header is zero (that would mean it is not 'set' yet), > but this would trigger even after your patch. So add new skb_inner_network_header_was_set() helper and use that. The syzbot reproducer injects data via packet socket. The skb that gets allocated and passed down the stack has ->protocol set to NSH (0x894f) and gso_type set to SKB_GSO_UDP | SKB_GSO_DODGY. This gets passed to skb_mac_gso_segment(), which sees NSH as ptype to find a callback for. nsh_gso_segment() retrieves next type: proto = tun_p_to_eth_p(nsh_hdr(skb)->np); ... which is MPLS (TUN_P_MPLS_UC). It updates skb->protocol and then calls mpls_gso_segment(). 
Inner offsets are all 0, so mpls_gso_segment() ends up with a negative header size. In case more callers rely on silent handling of such large may_pull values we could also 'legalize' this behaviour, either replacing the debug check with (len > INT_MAX) test or removing it and instead adding a comment before existing if (unlikely(len > skb->len)) return SKB_DROP_REASON_PKT_TOO_SMALL; test in pskb_may_pull_reason(), saying that this check also implicitly takes care of callers that miscompute header sizes. Cc: Simon Horman Fixes: 219eee9c0d16 ("net: skbuff: add overflow debug check to pull/push helpers") Reported-by: syzbot+99d15fcdb0132a1e1a82@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/00000000000043b1310611e388aa@google.com/raw Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20240222140321.14080-1-fw@strlen.de Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 28c7cb7ce251..1470b74fb6d2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2894,6 +2894,11 @@ static inline void skb_set_inner_network_header(struct sk_buff *skb, skb->inner_network_header += offset; } +static inline bool skb_inner_network_header_was_set(const struct sk_buff *skb) +{ + return skb->inner_network_header > 0; +} + static inline unsigned char *skb_inner_mac_header(const struct sk_buff *skb) { return skb->head + skb->inner_mac_header; -- cgit v1.2.3 From 31639fd6cebd4fc3687cceda14814f140c9fd95b Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Jan 2024 11:07:01 +0100 Subject: stackdepot: use variable size records for non-evictable entries With the introduction of stack depot evictions, each stack record is now fixed size, so that future reuse after an eviction can safely store differently sized stack traces. In all cases that do not make use of evictions, this wastes lots of space. 
Fix it by re-introducing variable size stack records (up to the max allowed size) for entries that will never be evicted. We know if an entry will never be evicted if the flag STACK_DEPOT_FLAG_GET is not provided, since a later stack_depot_put() attempt is undefined behavior. With my current kernel config that enables KASAN and also SLUB owner tracking, I observe (after a kernel boot) a whopping reduction of 296 stack depot pools, which translates into 4736 KiB saved. The savings here are from SLUB owner tracking only, because KASAN generic mode still uses refcounting. Before: pools: 893 allocations: 29841 frees: 6524 in_use: 23317 freelist_size: 3454 After: pools: 597 refcounted_allocations: 17547 refcounted_frees: 6477 refcounted_in_use: 11070 freelist_size: 3497 persistent_count: 12163 persistent_bytes: 1717008 [elver@google.com: fix -Wstringop-overflow warning] Link: https://lore.kernel.org/all/20240201135747.18eca98e@canb.auug.org.au/ Link: https://lkml.kernel.org/r/20240201090434.1762340-1-elver@google.com Link: https://lore.kernel.org/all/CABXGCsOzpRPZGg23QqJAzKnqkZPKzvieeg=W7sgjgi3q0pBo0g@mail.gmail.com/ Link: https://lkml.kernel.org/r/20240129100708.39460-1-elver@google.com Link: https://lore.kernel.org/all/CABXGCsOzpRPZGg23QqJAzKnqkZPKzvieeg=W7sgjgi3q0pBo0g@mail.gmail.com/ Fixes: 108be8def46e ("lib/stackdepot: allow users to evict stack traces") Signed-off-by: Marco Elver Reviewed-by: Andrey Konovalov Tested-by: Mikhail Gavrilov Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Vincenzo Frascino Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/poison.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/poison.h b/include/linux/poison.h index 27a7dad17eef..1f0ee2459f2a 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -92,4 +92,7 @@ /********** VFS **********/ #define VFS_PTR_POISON ((void *)(0xF5 + POISON_POINTER_DELTA)) +/********** lib/stackdepot.c 
**********/ +#define STACK_DEPOT_POISON ((void *)(0xD390 + POISON_POINTER_DELTA)) + #endif -- cgit v1.2.3 From 8151c7a35d8bd8a12e93538ef7963ea209b6ab41 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Feb 2024 22:59:02 +0100 Subject: lib/stackdepot: move stack_record struct definition into the header In order to move the heavy lifting into page_owner code, this one needs to have access to the stack_record structure, which right now sits in lib/stackdepot.c. Move it to the stackdepot.h header so page_owner can access stack_record's struct fields. Link: https://lkml.kernel.org/r/20240215215907.20121-3-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: Marco Elver Reviewed-by: Vlastimil Babka Acked-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 47 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index adcbb8f23600..c4b5ad57c066 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -30,6 +30,53 @@ typedef u32 depot_stack_handle_t; */ #define STACK_DEPOT_EXTRA_BITS 5 +#define DEPOT_HANDLE_BITS (sizeof(depot_stack_handle_t) * 8) + +#define DEPOT_POOL_ORDER 2 /* Pool size order, 4 pages */ +#define DEPOT_POOL_SIZE (1LL << (PAGE_SHIFT + DEPOT_POOL_ORDER)) +#define DEPOT_STACK_ALIGN 4 +#define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN) +#define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_OFFSET_BITS - \ + STACK_DEPOT_EXTRA_BITS) + +#ifdef CONFIG_STACKDEPOT +/* Compact structure that stores a reference to a stack. 
*/ +union handle_parts { + depot_stack_handle_t handle; + struct { + /* pool_index is offset by 1 */ + u32 pool_index : DEPOT_POOL_INDEX_BITS; + u32 offset : DEPOT_OFFSET_BITS; + u32 extra : STACK_DEPOT_EXTRA_BITS; + }; +}; + +struct stack_record { + struct list_head hash_list; /* Links in the hash table */ + u32 hash; /* Hash in hash table */ + u32 size; /* Number of stored frames */ + union handle_parts handle; /* Constant after initialization */ + refcount_t count; + union { + unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES]; /* Frames */ + struct { + /* + * An important invariant of the implementation is to + * only place a stack record onto the freelist iff its + * refcount is zero. Because stack records with a zero + * refcount are never considered as valid, it is safe to + * union @entries and freelist management state below. + * Conversely, as soon as an entry is off the freelist + * and its refcount becomes non-zero, the below must not + * be accessed until being placed back on the freelist. + */ + struct list_head free_list; /* Links in the freelist */ + unsigned long rcu_state; /* RCU cookie */ + }; + }; +}; +#endif + typedef u32 depot_flags_t; /* -- cgit v1.2.3 From 4bedfb314bdd85c1662ecc46fa25b33b998f994d Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Feb 2024 22:59:03 +0100 Subject: mm,page_owner: maintain own list of stack_records structs page_owner needs to increment a stack_record refcount when a new allocation occurs, and decrement it on a free operation. In order to do that, we need to have a way to get a stack_record from a handle. Implement __stack_depot_get_stack_record() which just does that, and make it public so page_owner can use it. Also, traversing all stackdepot buckets comes with its own complexity, plus we would have to implement a way to mark only those stack_records that were originated from page_owner, as those are the ones we are interested in. 
For that reason, page_owner maintains its own list of stack_records, because traversing that list is faster than traversing all buckets while keeping at the same time a low complexity. For now, add to stack_list only the stack_records of dummy_handle and failure_handle, and set their refcount of 1. Further patches will add code to increment or decrement stack_records count on allocation and free operation. Link: https://lkml.kernel.org/r/20240215215907.20121-4-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: Vlastimil Babka Reviewed-by: Marco Elver Acked-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index c4b5ad57c066..3c6caa5abc7c 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -178,6 +178,17 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags); +/** + * __stack_depot_get_stack_record - Get a pointer to a stack_record struct + * + * @handle: Stack depot handle + * + * This function is only for internal purposes. + * + * Return: Returns a pointer to a stack_record struct + */ +struct stack_record *__stack_depot_get_stack_record(depot_stack_handle_t handle); + /** * stack_depot_fetch - Fetch a stack trace from stack depot * -- cgit v1.2.3 From 55c49fee57af99f3c663e69dedc5b85e691bbe50 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 2 Jan 2024 19:46:27 +0100 Subject: mm/vmalloc: remove vmap_area_list Earlier, vmap_area_list is exported to vmcoreinfo so that makedumpfile get the base address of vmalloc area. Now, vmap_area_list is empty, so export VMALLOC_START to vmcoreinfo instead, and remove vmap_area_list. 
[urezki@gmail.com: fix a warning in the crash_save_vmcoreinfo_init()] Link: https://lkml.kernel.org/r/20240111192329.449189-1-urezki@gmail.com Link: https://lkml.kernel.org/r/20240102184633.748113-6-urezki@gmail.com Signed-off-by: Baoquan He Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Lorenzo Stoakes Cc: Christoph Hellwig Cc: Dave Chinner Cc: Joel Fernandes (Google) Cc: Kazuhito Hagio Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Cc: Paul E. McKenney Signed-off-by: Andrew Morton --- include/linux/vmalloc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index c720be70c8dd..91810b4e9510 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -253,7 +253,6 @@ extern long vread_iter(struct iov_iter *iter, const char *addr, size_t count); /* * Internals. Don't use.. */ -extern struct list_head vmap_area_list; extern __init void vm_area_add_early(struct vm_struct *vm); extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); -- cgit v1.2.3 From 85fcde402db191b5f222ebfecda653777d7d084e Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:41 +0800 Subject: kexec: split crashkernel reservation code out from crash_core.c Patch series "Split crash out from kexec and clean up related config items", v3. Motivation: ============= Previously, LKP reported a building error. When investigating, it can't be resolved reasonablly with the present messy kdump config items. 
https://lore.kernel.org/oe-kbuild-all/202312182200.Ka7MzifQ-lkp@intel.com/ The kdump (crash dumping) related config items can cause confusion: Firstly, CRASH_CORE enables codes including - crashkernel reservation; - elfcorehdr updating; - vmcoreinfo exporting; - crash hotplug handling; Now fadump of powerpc, kcore dynamic debugging and kdump all select CRASH_CORE, while - fadump needs crashkernel parsing, vmcoreinfo exporting, and accessing global variable 'elfcorehdr_addr'; - kcore only needs vmcoreinfo exporting; - kdump needs all of the current kernel/crash_core.c. So only enabling PROC_KCORE or FA_DUMP will enable CRASH_CORE, which misleads people into thinking that crash dumping is enabled when actually it is not. Secondly, It's not reasonable to allow KEXEC_CORE to select CRASH_CORE, because KEXEC_CORE enables codes which allocate control pages, copy kexec/kdump segments, and prepare for switching. These codes are shared by both kexec reboot and kdump. We could want kexec reboot, but disable kdump. In that case, CRASH_CORE should not be selected. -------------------- CONFIG_CRASH_CORE=y CONFIG_KEXEC_CORE=y CONFIG_KEXEC=y CONFIG_KEXEC_FILE=y --------------------- Thirdly, It's not reasonable to allow CRASH_DUMP to select KEXEC_CORE. That could allow KEXEC_CORE and CRASH_DUMP to be enabled independently of KEXEC or KEXEC_FILE. However, w/o KEXEC or KEXEC_FILE, the KEXEC_CORE code built in doesn't make any sense because no kernel loading or switching will happen to utilize the KEXEC_CORE code. --------------------- CONFIG_CRASH_CORE=y CONFIG_KEXEC_CORE=y CONFIG_CRASH_DUMP=y --------------------- In this case, what is worse, on arch sh and arm, KEXEC relies on MMU, while CRASH_DUMP can still be enabled when !MMU, so a compile error is seen, as the lkp test robot reported in the above link.
------arch/sh/Kconfig------ config ARCH_SUPPORTS_KEXEC def_bool MMU config ARCH_SUPPORTS_CRASH_DUMP def_bool BROKEN_ON_SMP --------------------------- Changes: =========== 1, split out crash_reserve.c from crash_core.c; 2, split out vmcore_info.c from crash_core.c; 3, move crash related codes in kexec_core.c into crash_core.c; 4, remove dependency of FA_DUMP on CRASH_DUMP; 5, clean up kdump related config items; 6, wrap up crash codes in crash related ifdefs on all 8 arch-es which support crash dumping, except for ppc; Achievement: =========== With above changes, I can rearrange the config item logic as below (the right item depends on or is selected by the left item): PROC_KCORE -----------> VMCORE_INFO |----------> VMCORE_INFO FA_DUMP----| |----------> CRASH_RESERVE ---->VMCORE_INFO / |---->CRASH_RESERVE KEXEC --| /| |--> KEXEC_CORE--> CRASH_DUMP-->/-|---->PROC_VMCORE KEXEC_FILE --| \ | \---->CRASH_HOTPLUG KEXEC --| |--> KEXEC_CORE (for kexec reboot only) KEXEC_FILE --| Test ======== On all 8 architectures, including x86_64, arm64, s390x, sh, arm, mips, riscv, loongarch, I tested the three config item settings below, and all builds passed.
Take configs on x86_64 as example here: (1) Both CONFIG_KEXEC and KEXEC_FILE are unset, then all kexec/kdump items are unset automatically: # Kexec and crash features # CONFIG_KEXEC is not set # CONFIG_KEXEC_FILE is not set # end of Kexec and crash features (2) set CONFIG_KEXEC_FILE and 'make olddefconfig': --------------- # Kexec and crash features CONFIG_CRASH_RESERVE=y CONFIG_VMCORE_INFO=y CONFIG_KEXEC_CORE=y CONFIG_KEXEC_FILE=y CONFIG_CRASH_DUMP=y CONFIG_CRASH_HOTPLUG=y CONFIG_CRASH_MAX_MEMORY_RANGES=8192 # end of Kexec and crash features --------------- (3) unset CONFIG_CRASH_DUMP in case 2 and execute 'make olddefconfig': ------------------------ # Kexec and crash features CONFIG_KEXEC_CORE=y CONFIG_KEXEC_FILE=y # end of Kexec and crash features ------------------------ Note: For ppc, it needs investigation to make clear how to split out crash code in arch folder. Hope Hari and Pingfan can help have a look, see if it's doable. Now, I make it either have both kexec and crash enabled, or disable both of them altogether. This patch (of 14): Both kdump and fa_dump of ppc rely on crashkernel reservation. Move the relevant codes into separate files: crash_reserve.c, include/linux/crash_reserve.h. And also add config item CRASH_RESERVE to control its enabling of the codes. And update config items which are related to crashkernel reservation. And also change ifdeffery from CONFIG_CRASH_CORE to CONFIG_CRASH_RESERVE when those scopes are only crashkernel reservation related. And also rename arch/XXX/include/asm/{crash_core.h => crash_reserve.h} on arm64, x86 and risc-v because those architectures' crash_core.h is only related to crashkernel reservation. [akpm@linux-foundation.org: s/CRASH_RESEERVE/CRASH_RESERVE/, per Klara Modin] Link: https://lkml.kernel.org/r/20240124051254.67105-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20240124051254.67105-2-bhe@redhat.com Signed-off-by: Baoquan He Acked-by: Hari Bathini Cc: Al Viro Cc: Eric W.
Biederman Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- include/linux/crash_core.h | 40 ------------------------------------ include/linux/crash_reserve.h | 48 +++++++++++++++++++++++++++++++++++++++++++ include/linux/kexec.h | 1 + 3 files changed, 49 insertions(+), 40 deletions(-) create mode 100644 include/linux/crash_reserve.h (limited to 'include/linux') diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 9eaeaafe0cad..1fde49246fa6 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -5,14 +5,6 @@ #include #include #include -#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION -#include -#endif - -/* Location of a reserved region to hold the crash kernel. - */ -extern struct resource crashk_res; -extern struct resource crashk_low_res; #define CRASH_CORE_NOTE_NAME "CORE" #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) @@ -87,38 +79,6 @@ Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len); void final_note(Elf_Word *buf); -int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, - unsigned long long *crash_size, unsigned long long *crash_base, - unsigned long long *low_size, bool *high); - -#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION -#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE -#define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20) -#endif -#ifndef CRASH_ALIGN -#define CRASH_ALIGN SZ_2M -#endif -#ifndef CRASH_ADDR_LOW_MAX -#define CRASH_ADDR_LOW_MAX SZ_4G -#endif -#ifndef CRASH_ADDR_HIGH_MAX -#define CRASH_ADDR_HIGH_MAX memblock_end_of_DRAM() -#endif - -void __init reserve_crashkernel_generic(char *cmdline, - unsigned long long crash_size, - unsigned long long crash_base, - unsigned long long crash_low_size, - bool high); -#else -static inline void __init reserve_crashkernel_generic(char *cmdline, - unsigned long long crash_size, - 
unsigned long long crash_base, - unsigned long long crash_low_size, - bool high) -{} -#endif - /* Alignment required for elf header segment */ #define ELF_CORE_HEADER_ALIGN 4096 diff --git a/include/linux/crash_reserve.h b/include/linux/crash_reserve.h new file mode 100644 index 000000000000..5a9df944fb80 --- /dev/null +++ b/include/linux/crash_reserve.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_CRASH_RESERVE_H +#define LINUX_CRASH_RESERVE_H + +#include +#include +#include +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +#include +#endif + +/* Location of a reserved region to hold the crash kernel. + */ +extern struct resource crashk_res; +extern struct resource crashk_low_res; + +int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base, + unsigned long long *low_size, bool *high); + +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE +#define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20) +#endif +#ifndef CRASH_ALIGN +#define CRASH_ALIGN SZ_2M +#endif +#ifndef CRASH_ADDR_LOW_MAX +#define CRASH_ADDR_LOW_MAX SZ_4G +#endif +#ifndef CRASH_ADDR_HIGH_MAX +#define CRASH_ADDR_HIGH_MAX memblock_end_of_DRAM() +#endif + +void __init reserve_crashkernel_generic(char *cmdline, + unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high); +#else +static inline void __init reserve_crashkernel_generic(char *cmdline, + unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high) +{} +#endif +#endif /* LINUX_CRASH_RESERVE_H */ diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 400cb6c02176..6d79bfb52e5b 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -16,6 +16,7 @@ #if !defined(__ASSEMBLY__) #include +#include #include #include -- cgit v1.2.3 From 
443cbaf9e2fdbef7d7cae457434a6cb8a679441b Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:42 +0800 Subject: crash: split vmcoreinfo exporting code out from crash_core.c Now move the relevant codes into separate files: kernel/crash_reserve.c, include/linux/crash_reserve.h. And add config item CRASH_RESERVE to control its enabling. And also update the old ifdeffery of CONFIG_CRASH_CORE, including of and config item dependency on CRASH_CORE accordingly. And also do renaming as follows: - arch/xxx/kernel/{crash_core.c => vmcore_info.c} because they are only related to vmcoreinfo exporting on x86, arm64, riscv. And also Remove config item CRASH_CORE, and rely on CONFIG_KEXEC_CORE to decide if build in crash_core.c. [yang.lee@linux.alibaba.com: remove duplicated include in vmcore_info.c] Link: https://lkml.kernel.org/r/20240126005744.16561-1-yang.lee@linux.alibaba.com Link: https://lkml.kernel.org/r/20240124051254.67105-3-bhe@redhat.com Signed-off-by: Baoquan He Signed-off-by: Yang Li Acked-by: Hari Bathini Cc: Al Viro Cc: Eric W. 
Biederman Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- include/linux/buildid.h | 2 +- include/linux/crash_core.h | 73 ---------------------------------------- include/linux/kexec.h | 1 + include/linux/vmcore_info.h | 81 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 83 insertions(+), 74 deletions(-) create mode 100644 include/linux/vmcore_info.h (limited to 'include/linux') diff --git a/include/linux/buildid.h b/include/linux/buildid.h index 8a582d242f06..20aa3c2d89f7 100644 --- a/include/linux/buildid.h +++ b/include/linux/buildid.h @@ -11,7 +11,7 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size); int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size); -#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_CRASH_CORE) +#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_VMCORE_INFO) extern unsigned char vmlinux_build_id[BUILD_ID_SIZE_MAX]; void init_vmlinux_build_id(void); #else diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 1fde49246fa6..7f19f62018ef 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -6,79 +6,6 @@ #include #include -#define CRASH_CORE_NOTE_NAME "CORE" -#define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) -#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4) -#define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) - -/* - * The per-cpu notes area is a list of notes terminated by a "NULL" - * note header. For kdump, the code in vmcore.c runs in the context - * of the second kernel to combine them into one note. 
- */ -#define CRASH_CORE_NOTE_BYTES ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ - CRASH_CORE_NOTE_NAME_BYTES + \ - CRASH_CORE_NOTE_DESC_BYTES) - -#define VMCOREINFO_BYTES PAGE_SIZE -#define VMCOREINFO_NOTE_NAME "VMCOREINFO" -#define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) -#define VMCOREINFO_NOTE_SIZE ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ - VMCOREINFO_NOTE_NAME_BYTES + \ - VMCOREINFO_BYTES) - -typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4]; -/* Per cpu memory for storing cpu states in case of system crash. */ -extern note_buf_t __percpu *crash_notes; - -void crash_update_vmcoreinfo_safecopy(void *ptr); -void crash_save_vmcoreinfo(void); -void arch_crash_save_vmcoreinfo(void); -__printf(1, 2) -void vmcoreinfo_append_str(const char *fmt, ...); -phys_addr_t paddr_vmcoreinfo_note(void); - -#define VMCOREINFO_OSRELEASE(value) \ - vmcoreinfo_append_str("OSRELEASE=%s\n", value) -#define VMCOREINFO_BUILD_ID() \ - ({ \ - static_assert(sizeof(vmlinux_build_id) == 20); \ - vmcoreinfo_append_str("BUILD-ID=%20phN\n", vmlinux_build_id); \ - }) - -#define VMCOREINFO_PAGESIZE(value) \ - vmcoreinfo_append_str("PAGESIZE=%ld\n", value) -#define VMCOREINFO_SYMBOL(name) \ - vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) -#define VMCOREINFO_SYMBOL_ARRAY(name) \ - vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)name) -#define VMCOREINFO_SIZE(name) \ - vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ - (unsigned long)sizeof(name)) -#define VMCOREINFO_STRUCT_SIZE(name) \ - vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ - (unsigned long)sizeof(struct name)) -#define VMCOREINFO_OFFSET(name, field) \ - vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ - (unsigned long)offsetof(struct name, field)) -#define VMCOREINFO_TYPE_OFFSET(name, field) \ - vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ - (unsigned long)offsetof(name, field)) -#define VMCOREINFO_LENGTH(name, value) \ - 
vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value) -#define VMCOREINFO_NUMBER(name) \ - vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) -#define VMCOREINFO_CONFIG(name) \ - vmcoreinfo_append_str("CONFIG_%s=y\n", #name) - -extern unsigned char *vmcoreinfo_data; -extern size_t vmcoreinfo_size; -extern u32 *vmcoreinfo_note; - -Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, - void *data, size_t data_len); -void final_note(Elf_Word *buf); - /* Alignment required for elf header segment */ #define ELF_CORE_HEADER_ALIGN 4096 diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 6d79bfb52e5b..9c7bb8b56ed6 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -16,6 +16,7 @@ #if !defined(__ASSEMBLY__) #include +#include #include #include #include diff --git a/include/linux/vmcore_info.h b/include/linux/vmcore_info.h new file mode 100644 index 000000000000..e1dec1a6a749 --- /dev/null +++ b/include/linux/vmcore_info.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_VMCORE_INFO_H +#define LINUX_VMCORE_INFO_H + +#include +#include +#include + +#define CRASH_CORE_NOTE_NAME "CORE" +#define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) +#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4) +#define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) + +/* + * The per-cpu notes area is a list of notes terminated by a "NULL" + * note header. For kdump, the code in vmcore.c runs in the context + * of the second kernel to combine them into one note. 
+ */ +#define CRASH_CORE_NOTE_BYTES ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ + CRASH_CORE_NOTE_NAME_BYTES + \ + CRASH_CORE_NOTE_DESC_BYTES) + +#define VMCOREINFO_BYTES PAGE_SIZE +#define VMCOREINFO_NOTE_NAME "VMCOREINFO" +#define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) +#define VMCOREINFO_NOTE_SIZE ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ + VMCOREINFO_NOTE_NAME_BYTES + \ + VMCOREINFO_BYTES) + +typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4]; +/* Per cpu memory for storing cpu states in case of system crash. */ +extern note_buf_t __percpu *crash_notes; + +void crash_update_vmcoreinfo_safecopy(void *ptr); +void crash_save_vmcoreinfo(void); +void arch_crash_save_vmcoreinfo(void); +__printf(1, 2) +void vmcoreinfo_append_str(const char *fmt, ...); +phys_addr_t paddr_vmcoreinfo_note(void); + +#define VMCOREINFO_OSRELEASE(value) \ + vmcoreinfo_append_str("OSRELEASE=%s\n", value) +#define VMCOREINFO_BUILD_ID() \ + ({ \ + static_assert(sizeof(vmlinux_build_id) == 20); \ + vmcoreinfo_append_str("BUILD-ID=%20phN\n", vmlinux_build_id); \ + }) + +#define VMCOREINFO_PAGESIZE(value) \ + vmcoreinfo_append_str("PAGESIZE=%ld\n", value) +#define VMCOREINFO_SYMBOL(name) \ + vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) +#define VMCOREINFO_SYMBOL_ARRAY(name) \ + vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)name) +#define VMCOREINFO_SIZE(name) \ + vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ + (unsigned long)sizeof(name)) +#define VMCOREINFO_STRUCT_SIZE(name) \ + vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ + (unsigned long)sizeof(struct name)) +#define VMCOREINFO_OFFSET(name, field) \ + vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ + (unsigned long)offsetof(struct name, field)) +#define VMCOREINFO_TYPE_OFFSET(name, field) \ + vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ + (unsigned long)offsetof(name, field)) +#define VMCOREINFO_LENGTH(name, value) \ + 
vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value) +#define VMCOREINFO_NUMBER(name) \ + vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) +#define VMCOREINFO_CONFIG(name) \ + vmcoreinfo_append_str("CONFIG_%s=y\n", #name) + +extern unsigned char *vmcoreinfo_data; +extern size_t vmcoreinfo_size; +extern u32 *vmcoreinfo_note; + +Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, + void *data, size_t data_len); +void final_note(Elf_Word *buf); +#endif /* LINUX_VMCORE_INFO_H */ -- cgit v1.2.3 From 02aff8480533817a29e820729360866441d7403d Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:44 +0800 Subject: crash: split crash dumping code out from kexec_core.c Currently, KEXEC_CORE select CRASH_CORE automatically because crash codes need be built in to avoid compiling error when building kexec code even though the crash dumping functionality is not enabled. E.g -------------------- CONFIG_CRASH_CORE=y CONFIG_KEXEC_CORE=y CONFIG_KEXEC=y CONFIG_KEXEC_FILE=y --------------------- After splitting out crashkernel reservation code and vmcoreinfo exporting code, there's only crash related code left in kernel/crash_core.c. Now move crash related codes from kexec_core.c to crash_core.c and only build it in when CONFIG_CRASH_DUMP=y. And also wrap up crash codes inside CONFIG_CRASH_DUMP ifdeffery scope, or replace inappropriate CONFIG_KEXEC_CORE ifdef with CONFIG_CRASH_DUMP ifdef in generic kernel files. With these changes, crash_core codes are abstracted from kexec codes and can be disabled at all if only kexec reboot feature is wanted. Link: https://lkml.kernel.org/r/20240124051254.67105-5-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. 
Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- include/linux/crash_core.h | 61 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/kexec.h | 45 +--------------------------------- 2 files changed, 62 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 7f19f62018ef..23270b16e1db 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -6,6 +6,48 @@ #include #include +struct kimage; + +#ifdef CONFIG_CRASH_DUMP + +int crash_shrink_memory(unsigned long new_size); +ssize_t crash_get_memory_size(void); + +#ifndef arch_kexec_protect_crashkres +/* + * Protection mechanism for crashkernel reserved memory after + * the kdump kernel is loaded. + * + * Provide an empty default implementation here -- architecture + * code may override this + */ +static inline void arch_kexec_protect_crashkres(void) { } +#endif + +#ifndef arch_kexec_unprotect_crashkres +static inline void arch_kexec_unprotect_crashkres(void) { } +#endif + + + +#ifndef arch_crash_handle_hotplug_event +static inline void arch_crash_handle_hotplug_event(struct kimage *image) { } +#endif + +int crash_check_update_elfcorehdr(void); + +#ifndef crash_hotplug_cpu_support +static inline int crash_hotplug_cpu_support(void) { return 0; } +#endif + +#ifndef crash_hotplug_memory_support +static inline int crash_hotplug_memory_support(void) { return 0; } +#endif + +#ifndef crash_get_elfcorehdr_size +static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; } +#endif + /* Alignment required for elf header segment */ #define ELF_CORE_HEADER_ALIGN 4096 @@ -31,4 +73,23 @@ struct kexec_segment; #define KEXEC_CRASH_HP_REMOVE_MEMORY 4 #define KEXEC_CRASH_HP_INVALID_CPU -1U +extern void __crash_kexec(struct pt_regs *regs); +extern void crash_kexec(struct pt_regs *regs); +int 
kexec_should_crash(struct task_struct *p); +int kexec_crash_loaded(void); +void crash_save_cpu(struct pt_regs *regs, int cpu); +extern int kimage_crash_copy_vmcoreinfo(struct kimage *image); + +#else /* !CONFIG_CRASH_DUMP*/ +struct pt_regs; +struct task_struct; +struct kimage; +static inline void __crash_kexec(struct pt_regs *regs) { } +static inline void crash_kexec(struct pt_regs *regs) { } +static inline int kexec_should_crash(struct task_struct *p) { return 0; } +static inline int kexec_crash_loaded(void) { return 0; } +static inline void crash_save_cpu(struct pt_regs *regs, int cpu) {}; +static inline int kimage_crash_copy_vmcoreinfo(struct kimage *image) { return 0; }; +#endif /* CONFIG_CRASH_DUMP*/ + #endif /* LINUX_CRASH_CORE_H */ diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 9c7bb8b56ed6..060835bb82d5 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -15,7 +15,6 @@ #if !defined(__ASSEMBLY__) -#include #include #include #include @@ -33,6 +32,7 @@ extern note_buf_t __percpu *crash_notes; #include #include #include +#include /* Verify architecture specific macros are defined */ @@ -380,13 +380,6 @@ extern struct page *kimage_alloc_control_pages(struct kimage *image, static inline int machine_kexec_post_load(struct kimage *image) { return 0; } #endif -extern void __crash_kexec(struct pt_regs *); -extern void crash_kexec(struct pt_regs *); -int kexec_should_crash(struct task_struct *); -int kexec_crash_loaded(void); -void crash_save_cpu(struct pt_regs *regs, int cpu); -extern int kimage_crash_copy_vmcoreinfo(struct kimage *image); - extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; @@ -410,24 +403,6 @@ bool kexec_load_permitted(int kexec_image_type); /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; -int crash_shrink_memory(unsigned long new_size); -ssize_t crash_get_memory_size(void); - -#ifndef arch_kexec_protect_crashkres -/* - * Protection mechanism for crashkernel 
reserved memory after - * the kdump kernel is loaded. - * - * Provide an empty default implementation here -- architecture - * code may override this - */ -static inline void arch_kexec_protect_crashkres(void) { } -#endif - -#ifndef arch_kexec_unprotect_crashkres -static inline void arch_kexec_unprotect_crashkres(void) { } -#endif - #ifndef page_to_boot_pfn static inline unsigned long page_to_boot_pfn(struct page *page) { @@ -484,24 +459,6 @@ static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, g static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { } #endif -#ifndef arch_crash_handle_hotplug_event -static inline void arch_crash_handle_hotplug_event(struct kimage *image) { } -#endif - -int crash_check_update_elfcorehdr(void); - -#ifndef crash_hotplug_cpu_support -static inline int crash_hotplug_cpu_support(void) { return 0; } -#endif - -#ifndef crash_hotplug_memory_support -static inline int crash_hotplug_memory_support(void) { return 0; } -#endif - -#ifndef crash_get_elfcorehdr_size -static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; } -#endif - extern bool kexec_file_dbg_print; #define kexec_dprintk(fmt, ...) \ -- cgit v1.2.3 From 78f2f60377ee43b7f27b4584dae754b8aa3f80e1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:12 -0800 Subject: mm/damon/core: set damos_quota->esz as public field and document Patch series "mm/damon: let DAMOS feeds and tame/auto-tune itself". The Aim-oriented Feedback-driven DAMOS Aggressiveness Auto-tuning patchset[1] which has merged since commit 9294a037c015 ("mm/damon/core: implement goal-oriented feedback-driven quota auto-tuning") made the mechanism and the policy separated. That is, users can set a part of DAMOS control policies without a deep understanding of the mechanism but just their demands such as SLA. However, users are still required to do some additional work of manually collecting their target metric and feeding it to DAMOS. 
In the case of end-users who use DAMON sysfs interface, the context switches between user-space and kernel-space could also make it inefficient. The overhead is supposed to be only trivial in common cases, though. Meanwhile, in simple use cases, the target metric could be common system metrics that the kernel can efficiently self-retrieve, such as memory pressure stall time (PSI). Extend DAMOS quota auto-tuning to support multiple types of metrics including the DAMOS self-retrievable ones, and add support for memory pressure stall time metric. Different types of metrics can be supported in future. The auto-tuning capability is currently supported for only users of DAMOS kernel API and DAMON sysfs interface. Extend the support to DAMON_RECLAIM. Patches Sequence ================ First five patches are for helping debugging and fine-tuning existing quota control features. The first one (patch 1) exposes the effective quota that is made with given user inputs to DAMOS kernel API users and kernel-doc documents. Following four patches implement (patches 1, 2 and 3) and document (patches 4 and 5) a new DAMON sysfs file that exposes the value. Following six patches cleanup and simplify the existing DAMOS quota auto-tuning code by improving layout of comments and data structures (patches 6 and 7), supporting common use cases, namely multiple goals (patches 8, 9 and 10), and simplifying the interface (patch 11). Then six patches for the main purpose of this patchset follow. The first three changes extend the core logic for various target metrics (patch 12), implement memory pressure stall time-based target metric support (patch 13), and update DAMON sysfs interface to support the new target metric (patch 14). Then, documentation updates for the features on design (patch 15), ABI (patch 16), and usage (patch 17) follow. Last three patches add auto-tuning support on DAMON_RECLAIM. 
The patches implement DAMON_RECLAIM parameters for user-feedback driven quota auto-tuning (patch 18), memory pressure stall time-driven quota self-tuning (patch 19), and finally update the DAMON_RECLAIM usage document for the new parameters (patch 20). [1] https://lore.kernel.org/all/20231130023652.50284-1-sj@kernel.org/ This patch (of 20): DAMOS allow users to specify the quota as they want in multiple ways including time quota, size quota, and feedback-based auto-tuning. DAMOS makes one effective quota out of the inputs and use it at the end. Knowing the current effective quota helps understanding DAMOS' internal mechanism and fine-tuning quotas. DAMON kernel API users can get the information from ->esz field of damos_quota struct, but the field is marked as private purpose, and not kernel-doc documented. Make it public and document. Link: https://lkml.kernel.org/r/20240219194431.159606-1-sj@kernel.org Link: https://lkml.kernel.org/r/20240219194431.159606-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 5881e4ac30be..93ef45b87b9c 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -138,6 +138,7 @@ enum damos_action { * * @get_score: Feedback function for self-tuning quota. * @get_score_arg: Parameter for @get_score + * @esz: Effective size quota in bytes. * * To avoid consuming too much CPU time or IO resources for applying the * &struct damos->action to large memory, DAMON allows users to set time and/or @@ -167,6 +168,8 @@ enum damos_action { * tuning is getting the feedback screo value of 10,000. If @ms and/or @sz are * set together, those work as a hard limit quota. If neither @ms nor @sz are * set, the mechanism starts from the quota of one byte. + * + * The resulting effective size quota in bytes is set to @esz. 
*/ struct damos_quota { unsigned long ms; @@ -179,14 +182,13 @@ struct damos_quota { unsigned long (*get_score)(void *arg); void *get_score_arg; + unsigned long esz; /* private: */ /* For throughput estimation */ unsigned long total_charged_sz; unsigned long total_charged_ns; - unsigned long esz; /* Effective size quota in bytes */ - /* For charging the quota */ unsigned long charged_sz; unsigned long charged_from; -- cgit v1.2.3 From 4d791a0a2ab47d70131909942009f12f61db20ab Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:17 -0800 Subject: mm/damon: move comments and fields for damos-quota-prioritization to the end The comments and definition of 'struct damos_quota' list a few fields for effective quota generation first, fields for regions prioritization under the quota, and then remaining fields for effective quota generation. Readers have to unnecessarily switch their context in the middle. List all the fields for the effective quota first, and then fields for the prioritization for making it easier to read. Link: https://lkml.kernel.org/r/20240219194431.159606-7-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 93ef45b87b9c..bd17b14828bc 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -128,18 +128,17 @@ enum damos_action { /** * struct damos_quota - Controls the aggressiveness of the given scheme. + * @reset_interval: Charge reset interval in milliseconds. * @ms: Maximum milliseconds that the scheme can use. * @sz: Maximum bytes of memory that the action can be applied. - * @reset_interval: Charge reset interval in milliseconds. + * @get_score: Feedback function for self-tuning quota. + * @get_score_arg: Parameter for @get_score + * @esz: Effective size quota in bytes.
* * @weight_sz: Weight of the region's size for prioritization. * @weight_nr_accesses: Weight of the region's nr_accesses for prioritization. * @weight_age: Weight of the region's age for prioritization. * - * @get_score: Feedback function for self-tuning quota. - * @get_score_arg: Parameter for @get_score - * @esz: Effective size quota in bytes. - * * To avoid consuming too much CPU time or IO resources for applying the * &struct damos->action to large memory, DAMON allows users to set time and/or * size quotas. The quotas can be set by writing non-zero values to &ms and @@ -152,12 +151,6 @@ enum damos_action { * throughput of the scheme's action. DAMON then compares it against &sz and * uses smaller one as the effective quota. * - * For selecting regions within the quota, DAMON prioritizes current scheme's - * target memory regions using the &struct damon_operations->get_scheme_score. - * You could customize the prioritization logic by setting &weight_sz, - * &weight_nr_accesses, and &weight_age, because monitoring operations are - * encouraged to respect those. - * * If @get_score function pointer is set, DAMON calls it back with * @get_score_arg and get the return value of it for every @reset_interval. * Then, DAMON adjusts the effective quota using the return value as a feedback @@ -170,20 +163,25 @@ enum damos_action { * set, the mechanism starts from the quota of one byte. * * The resulting effective size quota in bytes is set to @esz. + * + * For selecting regions within the quota, DAMON prioritizes current scheme's + * target memory regions using the &struct damon_operations->get_scheme_score. + * You could customize the prioritization logic by setting &weight_sz, + * &weight_nr_accesses, and &weight_age, because monitoring operations are + * encouraged to respect those. 
*/ struct damos_quota { + unsigned long reset_interval; unsigned long ms; unsigned long sz; - unsigned long reset_interval; + unsigned long (*get_score)(void *arg); + void *get_score_arg; + unsigned long esz; unsigned int weight_sz; unsigned int weight_nr_accesses; unsigned int weight_age; - unsigned long (*get_score)(void *arg); - void *get_score_arg; - unsigned long esz; - /* private: */ /* For throughput estimation */ unsigned long total_charged_sz; -- cgit v1.2.3 From 106e26fc1c4c1a0e3747246e15df2bc3aa9d46b2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:18 -0800 Subject: mm/damon/core: split out quota goal related fields to a struct 'struct damos_quota' is not small now. Split out fields for quota goal to a separate struct for easier reading. Link: https://lkml.kernel.org/r/20240219194431.159606-8-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index bd17b14828bc..2fe345adf6b2 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -126,13 +126,28 @@ enum damos_action { NR_DAMOS_ACTIONS, }; +/** + * struct damos_quota_goal - DAMOS scheme quota auto-tuning goal. + * @get_score: Function for getting current score of the goal. + * @get_score_arg: Parameter for @get_score + * + * Data structure for getting the current score of the quota tuning goal. + * Calling @get_score with @get_score_arg as the parameter should return the + * current score. Then the score is entered to DAMON's internal feedback loop + * mechanism to get the auto-tuned quota. The goal of the tuning is getting + * the feedback score value of 10,000. + */ +struct damos_quota_goal { + unsigned long (*get_score)(void *arg); + void *get_score_arg; +}; + /** * struct damos_quota - Controls the aggressiveness of the given scheme. 
* @reset_interval: Charge reset interval in milliseconds. * @ms: Maximum milliseconds that the scheme can use. * @sz: Maximum bytes of memory that the action can be applied. - * @get_score: Feedback function for self-tuning quota. - * @get_score_arg: Parameter for @get_score + * @goal: Quota auto-tuning goal. * @esz: Effective size quota in bytes. * * @weight_sz: Weight of the region's size for prioritization. @@ -151,16 +166,10 @@ enum damos_action { * throughput of the scheme's action. DAMON then compares it against &sz and * uses smaller one as the effective quota. * - * If @get_score function pointer is set, DAMON calls it back with - * @get_score_arg and get the return value of it for every @reset_interval. - * Then, DAMON adjusts the effective quota using the return value as a feedback - * score to the current quota, using its internal feedback loop algorithm. - * - * The feedback loop algorithem assumes the quota input and the feedback score - * output are in a positive proportional relationship, and the goal of the - * tuning is getting the feedback screo value of 10,000. If @ms and/or @sz are - * set together, those work as a hard limit quota. If neither @ms nor @sz are - * set, the mechanism starts from the quota of one byte. + * If ->get_score field of @goal is set, DAMON calculates yet another size + * quota based on the goal using its internal feedback loop algorithm, for + * every @reset_interval. Then, if the new size quota is smaller than the + * effective quota, it uses the new size quota as the effective quota. * * The resulting effective size quota in bytes is set to @esz. 
* @@ -174,8 +183,7 @@ struct damos_quota { unsigned long reset_interval; unsigned long ms; unsigned long sz; - unsigned long (*get_score)(void *arg); - void *get_score_arg; + struct damos_quota_goal goal; unsigned long esz; unsigned int weight_sz; -- cgit v1.2.3 From 91f21216a79d00f7da380ed4ce100e8a7a0d0cff Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:19 -0800 Subject: mm/damon/core: add multiple goals per damos_quota and helpers for those The feedback-driven DAMOS quota auto-tuning feature allows only a single goal to the DAMON kernel API users. The API users could implement multiple goals for the end-users on their level, and that's what DAMON sysfs interface is doing. More DAMON kernel API users such as DAMON_RECLAIM would need to do similar work. To reduce unnecessary future duplicated efforts, support multiple goals from DAMOS core layer. To make the support in minimum non-destructive change, keep the old single goal setup interface, and add multiple goals setup. The single goal will be treated as one of the multiple goals, so old API users are not required to make any change. Link: https://lkml.kernel.org/r/20240219194431.159606-9-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 2fe345adf6b2..4bd898eaf80e 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -130,6 +130,7 @@ enum damos_action { * struct damos_quota_goal - DAMOS scheme quota auto-tuning goal. * @get_score: Function for getting current score of the goal. * @get_score_arg: Parameter for @get_score + * @list: List head for siblings. * * Data structure for getting the current score of the quota tuning goal.
* Calling @get_score with @get_score_arg as the parameter should return the @@ -140,6 +141,7 @@ enum damos_action { struct damos_quota_goal { unsigned long (*get_score)(void *arg); void *get_score_arg; + struct list_head list; }; /** @@ -148,6 +150,7 @@ struct damos_quota_goal { * @ms: Maximum milliseconds that the scheme can use. * @sz: Maximum bytes of memory that the action can be applied. * @goal: Quota auto-tuning goal. + * @goals: Head of quota tuning goals (&damos_quota_goal) list. * @esz: Effective size quota in bytes. * * @weight_sz: Weight of the region's size for prioritization. @@ -171,6 +174,8 @@ struct damos_quota_goal { * every @reset_interval. Then, if the new size quota is smaller than the * effective quota, it uses the new size quota as the effective quota. * + * If @goals is not empty, same action is taken for each goal of the list. + * * The resulting effective size quota in bytes is set to @esz. * * For selecting regions within the quota, DAMON prioritizes current scheme's @@ -184,6 +189,7 @@ struct damos_quota { unsigned long ms; unsigned long sz; struct damos_quota_goal goal; + struct list_head goals; unsigned long esz; unsigned int weight_sz; @@ -648,6 +654,12 @@ static inline unsigned long damon_sz_region(struct damon_region *r) #define damon_for_each_scheme_safe(s, next, ctx) \ list_for_each_entry_safe(s, next, &(ctx)->schemes, list) +#define damos_for_each_quota_goal(goal, quota) \ + list_for_each_entry(goal, &quota->goals, list) + +#define damos_for_each_quota_goal_safe(goal, next, quota) \ + list_for_each_entry_safe(goal, next, &(quota)->goals, list) + #define damos_for_each_filter(f, scheme) \ list_for_each_entry(f, &(scheme)->filters, list) @@ -681,6 +693,11 @@ struct damos_filter *damos_new_filter(enum damos_filter_type type, void damos_add_filter(struct damos *s, struct damos_filter *f); void damos_destroy_filter(struct damos_filter *f); +struct damos_quota_goal *damos_new_quota_goal( + unsigned long (*get_score)(void *), void
*get_score_arg); +void damos_add_quota_goal(struct damos_quota *q, struct damos_quota_goal *g); +void damos_destroy_quota_goal(struct damos_quota_goal *goal); + struct damos *damon_new_scheme(struct damos_access_pattern *pattern, enum damos_action action, unsigned long apply_interval_us, -- cgit v1.2.3 From 89d347a545a704e0bd4fc61f9aea956d71bc72d2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:21 -0800 Subject: mm/damon/core: remove ->goal field of damos_quota DAMOS quota auto-tuning feature supports static single goal and dynamic multiple goals via DAMON kernel API, specifically via ->goal and ->goals fields of damos_quota struct, respectively. All in-tree DAMOS kernel API users are using only the dynamic multiple goals now. Remove the unused static single goal interface. Link: https://lkml.kernel.org/r/20240219194431.159606-11-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 4bd898eaf80e..76c965c1eea3 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -149,7 +149,6 @@ struct damos_quota_goal { * @reset_interval: Charge reset interval in milliseconds. * @ms: Maximum milliseconds that the scheme can use. * @sz: Maximum bytes of memory that the action can be applied. - * @goal: Quota auto-tuning goal. * @goals: Head of quota tuning goals (&damos_quota_goal) list. * @esz: Effective size quota in bytes. * @@ -169,12 +168,10 @@ struct damos_quota_goal { * throughput of the scheme's action. DAMON then compares it against &sz and * uses smaller one as the effective quota. * - * If ->get_score field of @goal is set, DAMON calculates yet another size - * quota based on the goal using its internal feedback loop algorithm, for - * every @reset_interval.
Then, if the new size quota is smaller than the - * effective quota, it uses the new size quota as the effective quota. - * - * If @goals is not empty, same action is taken for each goal of the list. + * If @goals is not empt, DAMON calculates yet another size quota based on the + * goals using its internal feedback loop algorithm, for every @reset_interval. + * Then, if the new size quota is smaller than the effective quota, it uses the + * new size quota as the effective quota. * * The resulting effective size quota in bytes is set to @esz. * @@ -188,7 +185,6 @@ struct damos_quota { unsigned long reset_interval; unsigned long ms; unsigned long sz; - struct damos_quota_goal goal; struct list_head goals; unsigned long esz; -- cgit v1.2.3 From 06ba5b309ed870cf1a0fedc611d0e7fbb6425a2d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:22 -0800 Subject: mm/damon/core: let goal specified with only target and current values DAMOS quota auto-tuning feature let users to set the goal by providing a function for getting the current score of the tuned quota. It allows flexible goal setup, but only simple user-set quota is currently being used. As a result, the only user of the DAMOS quota auto-tuning is using a silly void pointer casting based score value passing function. Simplify the interface and the user code by letting user directly set the target and the current value. Link: https://lkml.kernel.org/r/20240219194431.159606-12-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 76c965c1eea3..de0cdc7f96d2 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -128,19 +128,18 @@ enum damos_action { /** * struct damos_quota_goal - DAMOS scheme quota auto-tuning goal. - * @get_score: Function for getting current score of the goal. 
- * @get_score_arg: Parameter for @get_score + * @target_value: Target value to achieve with the tuning. + * @current_value: Current value that achieving with the tuning. * @list: List head for siblings. * - * Data structure for getting the current score of the quota tuning goal. - * Calling @get_score with @get_score_arg as the parameter should return the - * current score. Then the score is entered to DAMON's internal feedback loop - * mechanism to get the auto-tuned quota. The goal of the tuning is getting - * the feedback score value of 10,000. + * Data structure for getting the current score of the quota tuning goal. The + * score is calculated by how close @current_value and @target_value are. Then + * the score is entered to DAMON's internal feedback loop mechanism to get the + * auto-tuned quota. */ struct damos_quota_goal { - unsigned long (*get_score)(void *arg); - void *get_score_arg; + unsigned long target_value; + unsigned long current_value; struct list_head list; }; @@ -690,7 +689,7 @@ void damos_add_filter(struct damos *s, struct damos_filter *f); void damos_destroy_filter(struct damos_filter *f); struct damos_quota_goal *damos_new_quota_goal( - unsigned long (*get_score)(void *), void *get_score_arg); + unsigned long target_value, unsigned long current_value); void damos_add_quota_goal(struct damos_quota *q, struct damos_quota_goal *g); void damos_destroy_quota_goal(struct damos_quota_goal *goal); -- cgit v1.2.3 From bcce9bc16f56fbc254857fcb31674ab868b824d7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:23 -0800 Subject: mm/damon/core: support multiple metrics for quota goal DAMOS quota auto-tuning asks users to assess the current tuned quota and provide the feedback in a manual and repeated way. It allows users generate the feedback from a source that the kernel cannot access, and writing a script or a function for doing the manual and repeated feeding is not a big deal. 
However, additional works are additional works, and it could be more efficient if DAMOS could do the fetch itself, especially in case of DAMON sysfs interface use case, since it can avoid the context switches between the user-space and the kernel-space, though the overhead would be only trivial in most cases. Also in many cases, feedbacks could be made from kernel-accessible sources, such as PSI, CPU usage, etc. Make the quota goal to support multiple types of metrics including such ones. Link: https://lkml.kernel.org/r/20240219194431.159606-13-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index de0cdc7f96d2..5a06993d8479 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -126,18 +126,37 @@ enum damos_action { NR_DAMOS_ACTIONS, }; +/** + * enum damos_quota_goal_metric - Represents the metric to be used as the goal + * + * @DAMOS_QUOTA_USER_INPUT: User-input value. + * @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics. + * + * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported. + */ +enum damos_quota_goal_metric { + DAMOS_QUOTA_USER_INPUT, + NR_DAMOS_QUOTA_GOAL_METRICS, +}; + /** * struct damos_quota_goal - DAMOS scheme quota auto-tuning goal. - * @target_value: Target value to achieve with the tuning. - * @current_value: Current value that achieving with the tuning. + * @metric: Metric to be used for representing the goal. + * @target_value: Target value of @metric to achieve with the tuning. + * @current_value: Current value of @metric. * @list: List head for siblings. * * Data structure for getting the current score of the quota tuning goal. The * score is calculated by how close @current_value and @target_value are. 
Then * the score is entered to DAMON's internal feedback loop mechanism to get the * auto-tuned quota. + * + * If @metric is DAMOS_QUOTA_USER_INPUT, @current_value should be manually + * entered by the user, probably inside the kdamond callbacks. Otherwise, + * DAMON sets @current_value with self-measured value of @metric. */ struct damos_quota_goal { + enum damos_quota_goal_metric metric; unsigned long target_value; unsigned long current_value; struct list_head list; @@ -689,7 +708,8 @@ void damos_add_filter(struct damos *s, struct damos_filter *f); void damos_destroy_filter(struct damos_filter *f); struct damos_quota_goal *damos_new_quota_goal( - unsigned long target_value, unsigned long current_value); + enum damos_quota_goal_metric metric, + unsigned long target_value); void damos_add_quota_goal(struct damos_quota *q, struct damos_quota_goal *g); void damos_destroy_quota_goal(struct damos_quota_goal *goal); -- cgit v1.2.3 From 2dbb60f789cbb5c0000a4664f40f9358b3a62ba2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 19 Feb 2024 11:44:24 -0800 Subject: mm/damon/core: implement PSI metric DAMOS quota goal Extend DAMOS quota goal metric with system wide memory pressure stall time. Specifically, the system level 'some' PSI for memory is used. The target value can be set in microseconds. DAMOS measures the increased amount of the PSI metric in last quota_reset_interval and use the ratio of it versus the user-specified target PSI value as the score for the auto-tuning feedback loop. 
Link: https://lkml.kernel.org/r/20240219194431.159606-14-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 5a06993d8479..886d07294f4e 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -130,12 +130,14 @@ enum damos_action { * enum damos_quota_goal_metric - Represents the metric to be used as the goal * * @DAMOS_QUOTA_USER_INPUT: User-input value. + * @DAMOS_QUOTA_SOME_MEM_PSI_US: System level some memory PSI in us. * @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics. * * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported. */ enum damos_quota_goal_metric { DAMOS_QUOTA_USER_INPUT, + DAMOS_QUOTA_SOME_MEM_PSI_US, NR_DAMOS_QUOTA_GOAL_METRICS, }; @@ -144,6 +146,7 @@ enum damos_quota_goal_metric { * @metric: Metric to be used for representing the goal. * @target_value: Target value of @metric to achieve with the tuning. * @current_value: Current value of @metric. + * @last_psi_total: Last measured total PSI * @list: List head for siblings. * * Data structure for getting the current score of the quota tuning goal. 
The @@ -159,6 +162,10 @@ struct damos_quota_goal { enum damos_quota_goal_metric metric; unsigned long target_value; unsigned long current_value; + /* metric-dependent fields */ + union { + u64 last_psi_total; + }; struct list_head list; }; -- cgit v1.2.3 From 2a6e1a8f4cf39cd3d10c52fca639a7d6f30b7004 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Feb 2024 07:36:37 +0100 Subject: writeback: remove a duplicate prototype for tag_pages_for_writeback [hch@lst.de: split from a larger patch] Link: https://lkml.kernel.org/r/20240215063649.2164017-3-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Brian Foster Reviewed-by: Jan Kara Acked-by: Dave Chinner Cc: Christian Brauner Cc: David Howells Signed-off-by: Andrew Morton --- include/linux/writeback.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 453736fd1d23..4b8cf9e4810b 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -363,8 +363,6 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb); typedef int (*writepage_t)(struct folio *folio, struct writeback_control *wbc, void *data); -void tag_pages_for_writeback(struct address_space *mapping, - pgoff_t start, pgoff_t end); int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, void *data); -- cgit v1.2.3 From 751e0d559c62a87dc828af22c3c58dc078c734e3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Feb 2024 07:36:43 +0100 Subject: writeback: factor writeback_get_batch() out of write_cache_pages() This simple helper will be the basis of the writeback iterator. To make this work, we need to remember the current index and end positions in writeback_control. 
[hch@lst.de: heavily rebased, add helpers to get the tag and end index, don't keep the end index in struct writeback_control] Link: https://lkml.kernel.org/r/20240215063649.2164017-9-hch@lst.de Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Jan Kara Acked-by: Dave Chinner Cc: Christian Brauner Cc: David Howells Signed-off-by: Andrew Morton --- include/linux/writeback.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 4b8cf9e4810b..f67b3ea866a0 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -11,6 +11,7 @@ #include #include #include +#include struct bio; @@ -40,6 +41,7 @@ enum writeback_sync_modes { * in a manner such that unspecified fields are set to zero. */ struct writeback_control { + /* public fields that can be set and/or consumed by the caller: */ long nr_to_write; /* Write this many pages, and decrement this for each page written */ long pages_skipped; /* Pages which were not written */ @@ -77,6 +79,10 @@ struct writeback_control { */ struct swap_iocb **swap_plug; + /* internal fields used by the ->writepages implementation: */ + struct folio_batch fbatch; + pgoff_t index; + #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb; /* wb this writeback is issued under */ struct inode *inode; /* inode being written out */ -- cgit v1.2.3 From 535c5d9dadb327bb35f6d780fa037e0d3dfcb568 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Feb 2024 07:36:45 +0100 Subject: pagevec: add ability to iterate a queue Add a loop counter inside the folio_batch to let us iterate from 0-nr instead of decrementing nr and treating the batch as a stack. It would generate some very weird and suboptimal I/O patterns for page writeback to iterate over the batch as a stack. 
Link: https://lkml.kernel.org/r/20240215063649.2164017-11-hch@lst.de Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Jan Kara Acked-by: Dave Chinner Cc: Christian Brauner Cc: David Howells Signed-off-by: Andrew Morton --- include/linux/pagevec.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 87cc678adc85..fcc06c300a72 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -27,6 +27,7 @@ struct folio; */ struct folio_batch { unsigned char nr; + unsigned char i; bool percpu_pvec_drained; struct folio *folios[PAGEVEC_SIZE]; }; @@ -40,12 +41,14 @@ struct folio_batch { static inline void folio_batch_init(struct folio_batch *fbatch) { fbatch->nr = 0; + fbatch->i = 0; fbatch->percpu_pvec_drained = false; } static inline void folio_batch_reinit(struct folio_batch *fbatch) { fbatch->nr = 0; + fbatch->i = 0; } static inline unsigned int folio_batch_count(struct folio_batch *fbatch) @@ -75,6 +78,21 @@ static inline unsigned folio_batch_add(struct folio_batch *fbatch, return folio_batch_space(fbatch); } +/** + * folio_batch_next - Return the next folio to process. + * @fbatch: The folio batch being processed. + * + * Use this function to implement a queue of folios. + * + * Return: The next folio in the queue, or NULL if the queue is empty. 
+ */ +static inline struct folio *folio_batch_next(struct folio_batch *fbatch) +{ + if (fbatch->i == fbatch->nr) + return NULL; + return fbatch->folios[fbatch->i++]; +} + void __folio_batch_release(struct folio_batch *pvec); static inline void folio_batch_release(struct folio_batch *fbatch) -- cgit v1.2.3 From cdc150b575cf1176472791cfbe7738708812ea0d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Feb 2024 07:36:48 +0100 Subject: writeback: add a writeback iterator Refactor the code left in write_cache_pages into an iterator that the file system can call to get the next folio for a writeback operation: struct folio *folio = NULL; while ((folio = writeback_iter(mapping, wbc, folio, &error))) { error = <do per-folio writeback>; } The twist here is that the error value is passed by reference, so that the iterator can restore it when breaking out of the loop. Handling of the magic AOP_WRITEPAGE_ACTIVATE value stays outside the iterator and is just kept in the write_cache_pages legacy wrapper, in preparation for eventually killing it off. Heavily based on a for_each* based iterator from Matthew Wilcox.
Link: https://lkml.kernel.org/r/20240215063649.2164017-14-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Jan Kara Cc: Christian Brauner Cc: Dave Chinner Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Signed-off-by: Andrew Morton --- include/linux/writeback.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index f67b3ea866a0..9845cb62e40b 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -82,6 +82,7 @@ struct writeback_control { /* internal fields used by the ->writepages implementation: */ struct folio_batch fbatch; pgoff_t index; + int saved_err; #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb; /* wb this writeback is issued under */ @@ -366,6 +367,9 @@ int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, bool wb_over_bg_thresh(struct bdi_writeback *wb); +struct folio *writeback_iter(struct address_space *mapping, + struct writeback_control *wbc, struct folio *folio, int *error); + typedef int (*writepage_t)(struct folio *folio, struct writeback_control *wbc, void *data); -- cgit v1.2.3 From 88debc69754f7fe5186954941bb1cc4d744f4f25 Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Thu, 22 Feb 2024 16:34:15 +0100 Subject: cpufreq: Remove references to 10ms min sampling rate A minimum sampling rate value of 10ms was introduced in: commit cef9615a853e ("[CPUFREQ] ondemand: Uncouple minimal sampling rate from HZ in NO_HZ case") The use of this value was removed in: commit ed4676e25463 ("cpufreq: Replace "max_transition_latency" with "dynamic_switching"") Remove: - a comment referencing this value - an unused macro associated to this value Signed-off-by: Pierre Gondois Signed-off-by: Rafael J. 
Wysocki --- include/linux/cpufreq.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 9bebeec24abb..85908b3a2f24 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -569,9 +569,7 @@ static inline unsigned long cpufreq_scale(unsigned long old, u_int div, /* * The polling frequency depends on the capability of the processor. Default - * polling frequency is 1000 times the transition latency of the processor. The - * ondemand governor will work on any processor with transition latency <= 10ms, - * using appropriate sampling rate. + * polling frequency is 1000 times the transition latency of the processor. */ #define LATENCY_MULTIPLIER (1000) -- cgit v1.2.3 From 5c656fcdd6c60f71fccb07fe7b9d8d7e6c9811ff Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Sat, 24 Feb 2024 20:35:44 +0530 Subject: mm: Introduce new flag to indicate wc safe The VM_ALLOW_ANY_UNCACHED flag is implemented for ARM64, allowing KVM stage 2 device mapping attributes to use NormalNC rather than DEVICE_nGnRE, which allows guest mappings supporting write-combining attributes (WC). ARM does not architecturally guarantee this is safe, and indeed some MMIO regions like the GICv2 VCPU interface can trigger uncontained faults if NormalNC is used. Even worse, the expectation is that there are platforms where even DEVICE_nGnRE can allow uncontained faults in corner cases. Unfortunately existing ARM IP requires platform integration to take responsibility to prevent this. To safely use VFIO in KVM the platform must guarantee full safety in the guest where no action taken against a MMIO mapping can trigger an uncontained failure. The assumption is that most VFIO PCI platforms support this for both mapping types, at least in common flows, based on some expectations of how PCI IP is integrated. 
This can be enabled more broadly, for instance into vfio-platform drivers, but only after the platform vendor completes auditing for safety. The VMA flag VM_ALLOW_ANY_UNCACHED was found to be the simplest and cleanest way to communicate the information from VFIO to KVM that mapping the region in S2 as NormalNC is safe. KVM consumes it to activate the code that does the S2 mapping as NormalNC. Suggested-by: Catalin Marinas Reviewed-by: Jason Gunthorpe Reviewed-by: Marc Zyngier Acked-by: David Hildenbrand Signed-off-by: Ankit Agrawal Link: https://lore.kernel.org/r/20240224150546.368-3-ankita@nvidia.com Signed-off-by: Oliver Upton --- include/linux/mm.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f5a97dec5169..59576e56c58b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -391,6 +391,20 @@ extern unsigned int kobjsize(const void *objp); # define VM_UFFD_MINOR VM_NONE #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ +/* + * This flag is used to connect VFIO to arch specific KVM code. It + * indicates that the memory under this VMA is safe for use with any + * non-cachable memory type inside KVM. Some VFIO devices, on some + * platforms, are thought to be unsafe and can cause machine crashes + * if KVM does not lock down the memory type. 
+ */ +#ifdef CONFIG_64BIT +#define VM_ALLOW_ANY_UNCACHED_BIT 39 +#define VM_ALLOW_ANY_UNCACHED BIT(VM_ALLOW_ANY_UNCACHED_BIT) +#else +#define VM_ALLOW_ANY_UNCACHED VM_NONE +#endif + /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) -- cgit v1.2.3 From b361c9027b4e4159e7bcca4eb64fd26507c19994 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Fri, 23 Feb 2024 15:57:48 +0000 Subject: sched: Add a new function to compare if two cpus have the same capacity The new helper function is needed to help blk-mq check if it needs to dispatch the softirq on another CPU to match the performance level the IO requester is running at. This is important on HMP systems where not all CPUs have the same compute capacity. Signed-off-by: Qais Yousef Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20240223155749.2958009-2-qyousef@layalina.io Signed-off-by: Jens Axboe --- include/linux/sched/topology.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index a6e04b4a21d7..11e0e00e0bb9 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -176,6 +176,7 @@ extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], cpumask_var_t *alloc_sched_domains(unsigned int ndoms); void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); +bool cpus_equal_capacity(int this_cpu, int that_cpu); bool cpus_share_cache(int this_cpu, int that_cpu); bool cpus_share_resources(int this_cpu, int that_cpu); @@ -226,6 +227,11 @@ partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], { } +static inline bool cpus_equal_capacity(int this_cpu, int that_cpu) +{ + return true; +} + static inline bool cpus_share_cache(int this_cpu, int that_cpu) { return true; -- cgit v1.2.3 From c1b967d03c5d570ed7b90a88031fa2af34bf5b20 Mon Sep 17 00:00:00 2001 From: Al 
Viro Date: Wed, 27 Sep 2023 22:11:26 -0400 Subject: nfs: fix UAF on pathwalk running into umount NFS ->d_revalidate(), ->permission() and ->get_link() need to access some parts of nfs_server when called in RCU mode: server->flags server->caps *(server->io_stats) and, worst of all, call server->nfs_client->rpc_ops->have_delegation (the last one - as NFS_PROTO(inode)->have_delegation()). We really don't want to RCU-delay the entire nfs_free_server() (it would have to be done with schedule_work() from RCU callback, since it can't be made to run from interrupt context), but actual freeing of nfs_server and ->io_stats can be done via call_rcu() just fine. nfs_client part is handled simply by making nfs_free_client() use kfree_rcu(). Acked-by: Christian Brauner Signed-off-by: Al Viro --- include/linux/nfs_fs_sb.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index cd797e00fe35..92de074e63b9 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -124,6 +124,7 @@ struct nfs_client { char cl_ipaddr[48]; struct net *cl_net; struct list_head pending_cb_stateids; + struct rcu_head rcu; }; /* @@ -265,6 +266,7 @@ struct nfs_server { const struct cred *cred; bool has_sec_mnt_opts; struct kobject kobj; + struct rcu_head rcu; }; /* Server capabilities */ -- cgit v1.2.3 From e31f0a57ae1ab2f6e17adb8e602bc120ad722232 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Sep 2023 00:12:00 -0400 Subject: procfs: make freeing proc_fs_info rcu-delayed makes proc_pid_ns() safe from rcu pathwalk (put_pid_ns() is still synchronous, but that's not a problem - it does rcu-delay everything that needs to be) Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- include/linux/proc_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index de407e7c3b55..0b2a89854440 100644 --- a/include/linux/proc_fs.h +++ 
b/include/linux/proc_fs.h @@ -65,6 +65,7 @@ struct proc_fs_info { kgid_t pid_gid; enum proc_hidepid hide_pid; enum proc_pidonly pidonly; + struct rcu_head rcu; }; static inline struct proc_fs_info *proc_sb_info(struct super_block *sb) -- cgit v1.2.3 From 5270316c9fec8cc99aa0e0a258509c5c7f789d12 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Thu, 22 Feb 2024 14:35:00 +0100 Subject: kbuild: Use -fmin-function-alignment when available GCC recently added option -fmin-function-alignment, which should appear in GCC 14. Unlike -falign-functions, this option causes all functions to be aligned at the specified value, including the cold ones. In particular, when an arm64 kernel is built with DYNAMIC_FTRACE_WITH_CALL_OPS=y, the 8-byte function alignment is required for correct functionality. This was done by -falign-functions=8 and having workarounds in the kernel to force the compiler to follow this alignment. The new -fmin-function-alignment option directly guarantees it. Detect availability of -fmin-function-alignment and use it instead of -falign-functions when present. Introduce CC_HAS_SANE_FUNCTION_ALIGNMENT and enable __cold to work as expected when it is set. Signed-off-by: Petr Pavlu Reviewed-by: Nathan Chancellor Acked-by: Mark Rutland Signed-off-by: Masahiro Yamada --- include/linux/compiler_types.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 0caf354cb94b..fb8888678687 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -99,17 +99,17 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } * gcc: https://gcc.gnu.org/onlinedocs/gcc/Label-Attributes.html#index-cold-label-attribute * * When -falign-functions=N is in use, we must avoid the cold attribute as - * contemporary versions of GCC drop the alignment for cold functions. 
Worse, - * GCC can implicitly mark callees of cold functions as cold themselves, so - * it's not sufficient to add __function_aligned here as that will not ensure - * that callees are correctly aligned. + * GCC drops the alignment for cold functions. Worse, GCC can implicitly mark + * callees of cold functions as cold themselves, so it's not sufficient to add + * __function_aligned here as that will not ensure that callees are correctly + * aligned. * * See: * * https://lore.kernel.org/lkml/Y77%2FqVgvaJidFpYt@FVFF77S0Q05N * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c9 */ -#if !defined(CONFIG_CC_IS_GCC) || (CONFIG_FUNCTION_ALIGNMENT == 0) +#if defined(CONFIG_CC_HAS_SANE_FUNCTION_ALIGNMENT) || (CONFIG_FUNCTION_ALIGNMENT == 0) #define __cold __attribute__((__cold__)) #else #define __cold -- cgit v1.2.3 From 0b7c6075022ccff529318597dc6b165dd6a25c8f Mon Sep 17 00:00:00 2001 From: Peter Griffin Date: Tue, 20 Feb 2024 22:06:12 +0000 Subject: soc: samsung: exynos-pmu: Add regmap support for SoCs that protect PMU regs Some Exynos based SoCs like Tensor gs101 protect the PMU registers for security hardening reasons so that they are only write accessible in el3 via an SMC call. As most Exynos drivers that need to write PMU registers currently obtain a regmap via syscon (phys, pinctrl, watchdog). Support for the above usecase is implemented in this driver using a custom regmap similar to syscon to handle the SMC call. Platforms that don't secure PMU registers, get a mmio regmap like before. As regmaps abstract out the underlying register access changes to the leaf drivers are minimal. A new API exynos_get_pmu_regmap_by_phandle() is provided for leaf drivers that currently use syscon_regmap_lookup_by_phandle(). This also handles deferred probing. 
Tested-by: Sam Protsenko Tested-by: Alexey Klimov Reviewed-by: Sam Protsenko Signed-off-by: Peter Griffin Link: https://lore.kernel.org/r/20240220220613.797068-2-peter.griffin@linaro.org Signed-off-by: Krzysztof Kozlowski --- include/linux/soc/samsung/exynos-pmu.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/soc/samsung/exynos-pmu.h b/include/linux/soc/samsung/exynos-pmu.h index a4f5516cc956..2bd9d12d9a52 100644 --- a/include/linux/soc/samsung/exynos-pmu.h +++ b/include/linux/soc/samsung/exynos-pmu.h @@ -10,6 +10,7 @@ #define __LINUX_SOC_EXYNOS_PMU_H struct regmap; +struct device_node; enum sys_powerdown { SYS_AFTR, @@ -20,12 +21,20 @@ enum sys_powerdown { extern void exynos_sys_powerdown_conf(enum sys_powerdown mode); #ifdef CONFIG_EXYNOS_PMU -extern struct regmap *exynos_get_pmu_regmap(void); +struct regmap *exynos_get_pmu_regmap(void); +struct regmap *exynos_get_pmu_regmap_by_phandle(struct device_node *np, + const char *propname); #else static inline struct regmap *exynos_get_pmu_regmap(void) { return ERR_PTR(-ENODEV); } + +static inline struct regmap *exynos_get_pmu_regmap_by_phandle(struct device_node *np, + const char *propname) +{ + return ERR_PTR(-ENODEV); +} #endif #endif /* __LINUX_SOC_EXYNOS_PMU_H */ -- cgit v1.2.3 From bac0a9e56e1fd14b227ab57142eca6f7bc6e6115 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 8 Feb 2024 19:10:45 +0100 Subject: file: add alloc_file_pseudo_noaccount() When we open block devices as files we want to make sure to not charge them against the open file limit of the caller as that can cause spurious failures. 
Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-1-adbd023e19cc@kernel.org Signed-off-by: Christian Brauner --- include/linux/file.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/file.h b/include/linux/file.h index 6834a29338c4..169692cb1906 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -24,6 +24,8 @@ struct inode; struct path; extern struct file *alloc_file_pseudo(struct inode *, struct vfsmount *, const char *, int flags, const struct file_operations *); +extern struct file *alloc_file_pseudo_noaccount(struct inode *, struct vfsmount *, + const char *, int flags, const struct file_operations *); extern struct file *alloc_file_clone(struct file *, int flags, const struct file_operations *); -- cgit v1.2.3 From f3a608827d1f8de0dd12813e8d9c6803fe64e119 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 8 Feb 2024 18:47:35 +0100 Subject: bdev: open block device as files Add two new helpers to allow opening block devices as files. This is not the final infrastructure. This still opens the block device before opening a struct file. Until we have removed all references to struct bdev_handle we can't switch the order: * Introduce blk_to_file_flags() to translate from block specific to flags usable to open a new file. * Introduce bdev_file_open_by_{dev,path}(). * Introduce temporary sb_bdev_handle() helper to retrieve a struct bdev_handle from a block device file and update places that directly reference struct bdev_handle to rely on it. * Don't count block device opens against the number of open files. A bdev_file_open_by_{dev,path}() file is never installed into any file descriptor table. One idea that came to mind was to use kernel_tmpfile_open() which would require us to pass a path and it would then call do_dentry_open() going through the regular fops->open::blkdev_open() path.
But then we're back to the problem of routing block specific flags such as BLK_OPEN_RESTRICT_WRITES through the open path and would have to waste FMODE_* flags every time we add a new one. With this we can avoid using a flag bit and we have more leeway in how we open block devices from bdev_open_by_{dev,path}(). Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-1-adbd023e19cc@kernel.org Signed-off-by: Christian Brauner --- include/linux/blkdev.h | 7 +++++++ include/linux/fs.h | 10 ++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 99e4f5e72213..76706aa47316 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -24,6 +24,7 @@ #include #include #include +#include struct module; struct request_queue; @@ -1474,6 +1475,7 @@ extern const struct blk_holder_ops fs_holder_ops; (BLK_OPEN_READ | BLK_OPEN_RESTRICT_WRITES | \ (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE)) +/* @bdev_handle will be removed soon. 
*/ struct bdev_handle { struct block_device *bdev; void *holder; @@ -1484,6 +1486,10 @@ struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); +struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, + const struct blk_holder_ops *hops); +struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, + void *holder, const struct blk_holder_ops *hops); int bd_prepare_to_claim(struct block_device *bdev, void *holder, const struct blk_holder_ops *hops); void bd_abort_claiming(struct block_device *bdev, void *holder); @@ -1494,6 +1500,7 @@ struct block_device *blkdev_get_no_open(dev_t dev); void blkdev_put_no_open(struct block_device *bdev); struct block_device *I_BDEV(struct inode *inode); +struct block_device *file_bdev(struct file *bdev_file); #ifdef CONFIG_BLOCK void invalidate_bdev(struct block_device *bdev); diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a70495..e9291e27cc47 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1228,8 +1228,8 @@ struct super_block { #endif struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ struct list_head s_mounts; /* list of mounts; _not_ for fs use */ - struct block_device *s_bdev; - struct bdev_handle *s_bdev_handle; + struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */ + struct file *s_bdev_file; struct backing_dev_info *s_bdi; struct mtd_info *s_mtd; struct hlist_node s_instances; @@ -1327,6 +1327,12 @@ struct super_block { struct list_head s_inodes_wb; /* writeback inodes */ } __randomize_layout; +/* Temporary helper that will go away. 
*/ +static inline struct bdev_handle *sb_bdev_handle(struct super_block *sb) +{ + return sb->s_bdev_file->private_data; +} + static inline struct user_namespace *i_user_ns(const struct inode *inode) { return inode->i_sb->s_user_ns; -- cgit v1.2.3 From a28d893eb3270cf62c10dd8777af0d8452cdc072 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:21 +0100 Subject: md: port block device access to file Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-4-adbd023e19cc@kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/device-mapper.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 772ab4d74d94..82b2195efaca 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -165,7 +165,7 @@ void dm_error(const char *message); struct dm_dev { struct block_device *bdev; - struct bdev_handle *bdev_handle; + struct file *bdev_file; struct dax_device *dax_dev; blk_mode_t mode; char name[16]; -- cgit v1.2.3 From 16ca5dfd8dcd2816edc8f4e68ac4b9c5a606fc98 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:22 +0100 Subject: swap: port block device usage to file Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-5-adbd023e19cc@kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/swap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 4db00ddad261..e5b82bc05e60 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -298,7 +298,7 @@ struct swap_info_struct { unsigned int __percpu *cluster_next_cpu; /*percpu index for next allocation */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ struct rb_root swap_extent_root;/* root of the swap 
extent rbtree */ - struct bdev_handle *bdev_handle;/* open handle of the bdev */ + struct file *bdev_file; /* open handle of the bdev */ struct block_device *bdev; /* swap device or bdev of swap file */ struct file *swap_file; /* seldom referenced */ unsigned int old_block_size; /* seldom referenced */ -- cgit v1.2.3 From 05fb1dbc821f3016a52621ccd4530c269b626130 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:26 +0100 Subject: pktcdvd: port block device access to file Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-9-adbd023e19cc@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/pktcdvd.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h index 79594aeb160d..2f1b952d596a 100644 --- a/include/linux/pktcdvd.h +++ b/include/linux/pktcdvd.h @@ -154,9 +154,9 @@ struct packet_stacked_data struct pktcdvd_device { - struct bdev_handle *bdev_handle; /* dev attached */ + struct file *bdev_file; /* dev attached */ /* handle acquired for bdev during pkt_open_dev() */ - struct bdev_handle *open_bdev_handle; + struct file *f_open_bdev; dev_t pkt_dev; /* our dev */ struct packet_settings settings; struct packet_stats stats; -- cgit v1.2.3 From e97d06a46526d9392cbdbd7eda193091e1af2723 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:44 +0100 Subject: bdev: remove bdev_open_by_path() Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-27-adbd023e19cc@kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 76706aa47316..5880d5abfebe 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1484,8 +1484,6 @@ struct bdev_handle { struct bdev_handle *bdev_open_by_dev(dev_t 
dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); -struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode, - void *holder, const struct blk_holder_ops *hops); struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, -- cgit v1.2.3 From b1211a25c4fe3443cfef4ed7c39251502a663776 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:45 +0100 Subject: bdev: make bdev_{release, open_by_dev}() private to block layer Move both of them to the private block header. There's no caller in the tree anymore that uses them directly. Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-28-adbd023e19cc@kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/blkdev.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5880d5abfebe..495f55587207 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1482,8 +1482,6 @@ struct bdev_handle { blk_mode_t mode; }; -struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, - const struct blk_holder_ops *hops); struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, @@ -1491,7 +1489,6 @@ struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, int bd_prepare_to_claim(struct block_device *bdev, void *holder, const struct blk_holder_ops *hops); void bd_abort_claiming(struct block_device *bdev, void *holder); -void bdev_release(struct bdev_handle *handle); /* just for blk-cgroup, don't use elsewhere */ struct block_device *blkdev_get_no_open(dev_t dev); -- cgit v1.2.3 From a56aefca8d386181415a1fb7cfec2f72b0404797 Mon Sep 17 00:00:00 2001 
From: Christian Brauner Date: Tue, 23 Jan 2024 14:26:46 +0100 Subject: bdev: make struct bdev_handle private to the block layer Link: https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-29-adbd023e19cc@kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- include/linux/blkdev.h | 7 ------- include/linux/fs.h | 6 ------ 2 files changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 495f55587207..2f5dbde23094 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1475,13 +1475,6 @@ extern const struct blk_holder_ops fs_holder_ops; (BLK_OPEN_READ | BLK_OPEN_RESTRICT_WRITES | \ (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE)) -/* @bdev_handle will be removed soon. */ -struct bdev_handle { - struct block_device *bdev; - void *holder; - blk_mode_t mode; -}; - struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, diff --git a/include/linux/fs.h b/include/linux/fs.h index e9291e27cc47..6e0714d35d9b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1327,12 +1327,6 @@ struct super_block { struct list_head s_inodes_wb; /* writeback inodes */ } __randomize_layout; -/* Temporary helper that will go away. */ -static inline struct bdev_handle *sb_bdev_handle(struct super_block *sb) -{ - return sb->s_bdev_file->private_data; -} - static inline struct user_namespace *i_user_ns(const struct inode *inode) { return inode->i_sb->s_user_ns; -- cgit v1.2.3 From 5f85c4d10ef46a5f457a9d337159ff620a0a2191 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Tue, 30 Jan 2024 22:06:45 +0100 Subject: hwmon: (core) Add support for humidity min/max alarm Add min_alarm and max_alarm attributes for humidityX to support devices that can generate these alarms. Such attributes already exist for other magnitudes such as tempX. 
Tested with a ChipCap 2 temperature-humidity sensor. Signed-off-by: Javier Carrasco Link: https://lore.kernel.org/r/20240130-topic-chipcap2-v6-2-260bea05cf9b@gmail.com Signed-off-by: Guenter Roeck --- include/linux/hwmon.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index c7885fdce88f..edf96f249eb5 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -295,6 +295,8 @@ enum hwmon_humidity_attributes { hwmon_humidity_fault, hwmon_humidity_rated_min, hwmon_humidity_rated_max, + hwmon_humidity_min_alarm, + hwmon_humidity_max_alarm, }; #define HWMON_H_ENABLE BIT(hwmon_humidity_enable) @@ -308,6 +310,8 @@ enum hwmon_humidity_attributes { #define HWMON_H_FAULT BIT(hwmon_humidity_fault) #define HWMON_H_RATED_MIN BIT(hwmon_humidity_rated_min) #define HWMON_H_RATED_MAX BIT(hwmon_humidity_rated_max) +#define HWMON_H_MIN_ALARM BIT(hwmon_humidity_min_alarm) +#define HWMON_H_MAX_ALARM BIT(hwmon_humidity_max_alarm) enum hwmon_fan_attributes { hwmon_fan_enable, -- cgit v1.2.3 From 7e84c961b2eb062d2f47037dcca52dcd1d3615b5 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Tue, 19 Dec 2023 02:33:24 +0000 Subject: mtd: ubi: introduce pre-removal notification for UBI volumes Introduce a new notification type UBI_VOLUME_SHUTDOWN to inform users that a volume is just about to be removed. This is needed because users (such as the NVMEM subsystem) expect that at the time their removal function is called, the parenting device is still available (for removal of sysfs nodes, for example, in case of NVMEM which otherwise WARNs on volume removal). 
Signed-off-by: Daniel Golle Signed-off-by: Richard Weinberger --- include/linux/mtd/ubi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/ubi.h b/include/linux/mtd/ubi.h index a529347fd75b..562f92504f2b 100644 --- a/include/linux/mtd/ubi.h +++ b/include/linux/mtd/ubi.h @@ -192,6 +192,7 @@ struct ubi_device_info { * or a volume was removed) * @UBI_VOLUME_RESIZED: a volume has been re-sized * @UBI_VOLUME_RENAMED: a volume has been re-named + * @UBI_VOLUME_SHUTDOWN: a volume is going to removed, shutdown users * @UBI_VOLUME_UPDATED: data has been written to a volume * * These constants define which type of event has happened when a volume @@ -202,6 +203,7 @@ enum { UBI_VOLUME_REMOVED, UBI_VOLUME_RESIZED, UBI_VOLUME_RENAMED, + UBI_VOLUME_SHUTDOWN, UBI_VOLUME_UPDATED, }; -- cgit v1.2.3 From bfe93930ea1ea3c6c115a7d44af6e4fea609067e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 5 Feb 2024 13:08:22 -0800 Subject: rcu-tasks: Add data to eliminate RCU-tasks/do_exit() deadlocks Holding a mutex across synchronize_rcu_tasks() and acquiring that same mutex in code called from do_exit() after its call to exit_tasks_rcu_start() but before its call to exit_tasks_rcu_stop() results in deadlock. This is by design, because tasks that are far enough into do_exit() are no longer present on the tasks list, making it a bit difficult for RCU Tasks to find them, let alone wait on them to do a voluntary context switch. However, such deadlocks are becoming more frequent. In addition, lockdep currently does not detect such deadlocks and they can be difficult to reproduce. In addition, if a task voluntarily context switches during that time (for example, if it blocks acquiring a mutex), then this task is in an RCU Tasks quiescent state. And with some adjustments, RCU Tasks could just as well take advantage of that fact. 
This commit therefore adds the data structures that will be needed to rely on these quiescent states and to eliminate these deadlocks. Link: https://lore.kernel.org/all/20240118021842.290665-1-chenzhongjin@huawei.com/ Reported-by: Chen Zhongjin Reported-by: Yang Jihong Signed-off-by: Paul E. McKenney Tested-by: Yang Jihong Tested-by: Chen Zhongjin Reviewed-by: Frederic Weisbecker Signed-off-by: Boqun Feng --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index ffe8f618ab86..5eeebed2dd9b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -858,6 +858,8 @@ struct task_struct { u8 rcu_tasks_idx; int rcu_tasks_idle_cpu; struct list_head rcu_tasks_holdout_list; + int rcu_tasks_exit_cpu; + struct list_head rcu_tasks_exit_list; #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU -- cgit v1.2.3 From 0dc5b8abfa03e8720cb341699e3ece194058bb03 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 20 Feb 2024 08:22:13 +0100 Subject: interconnect: constify of_phandle_args in xlate The xlate callbacks are supposed to translate of_phandle_args to proper provider without modifying the of_phandle_args. Make the argument pointer to const for code safety and readability. 
Acked-by: Konrad Dybcio Acked-by: Thierry Reding # Tegra Signed-off-by: Krzysztof Kozlowski Acked-by: Alim Akhtar # Samsung Link: https://lore.kernel.org/r/20240220072213.35779-1-krzysztof.kozlowski@linaro.org Signed-off-by: Georgi Djakov --- include/linux/interconnect-provider.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/interconnect-provider.h b/include/linux/interconnect-provider.h index 7ba183f221f1..f5aef8784692 100644 --- a/include/linux/interconnect-provider.h +++ b/include/linux/interconnect-provider.h @@ -36,7 +36,7 @@ struct icc_onecell_data { struct icc_node *nodes[] __counted_by(num_nodes); }; -struct icc_node *of_icc_xlate_onecell(struct of_phandle_args *spec, +struct icc_node *of_icc_xlate_onecell(const struct of_phandle_args *spec, void *data); /** @@ -65,8 +65,9 @@ struct icc_provider { u32 peak_bw, u32 *agg_avg, u32 *agg_peak); void (*pre_aggregate)(struct icc_node *node); int (*get_bw)(struct icc_node *node, u32 *avg, u32 *peak); - struct icc_node* (*xlate)(struct of_phandle_args *spec, void *data); - struct icc_node_data* (*xlate_extended)(struct of_phandle_args *spec, void *data); + struct icc_node* (*xlate)(const struct of_phandle_args *spec, void *data); + struct icc_node_data* (*xlate_extended)(const struct of_phandle_args *spec, + void *data); struct device *dev; int users; bool inter_set; @@ -124,7 +125,7 @@ int icc_nodes_remove(struct icc_provider *provider); void icc_provider_init(struct icc_provider *provider); int icc_provider_register(struct icc_provider *provider); void icc_provider_deregister(struct icc_provider *provider); -struct icc_node_data *of_icc_get_from_provider(struct of_phandle_args *spec); +struct icc_node_data *of_icc_get_from_provider(const struct of_phandle_args *spec); void icc_sync_state(struct device *dev); #else @@ -171,7 +172,7 @@ static inline int icc_provider_register(struct icc_provider *provider) static inline void 
icc_provider_deregister(struct icc_provider *provider) { } -static inline struct icc_node_data *of_icc_get_from_provider(struct of_phandle_args *spec) +static inline struct icc_node_data *of_icc_get_from_provider(const struct of_phandle_args *spec) { return ERR_PTR(-ENOTSUPP); } -- cgit v1.2.3 From b8a730836c6b1788ca2fbd6bcc2ac99e97ef7de9 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Mon, 19 Feb 2024 09:45:50 -0300 Subject: thunderbolt: Constify the struct device_type usage Since commit aed65af1cc2f ("drivers: make device_type const"), the driver core can properly handle constant struct device_type. Move the tb_domain_type, tb_retimer_type, tb_switch_type, usb4_port_device_type, tb_service_type and tb_xdomain_type variables to be constant structures as well, placing it into read-only memory which can not be modified at runtime. Cc: Greg Kroah-Hartman Signed-off-by: Ricardo B. Marliere Signed-off-by: Mika Westerberg --- include/linux/thunderbolt.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 2c835e5c41f6..4338ea9ac4fd 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -87,8 +87,8 @@ struct tb { }; extern const struct bus_type tb_bus_type; -extern struct device_type tb_service_type; -extern struct device_type tb_xdomain_type; +extern const struct device_type tb_service_type; +extern const struct device_type tb_xdomain_type; #define TB_LINKS_PER_PHY_PORT 2 -- cgit v1.2.3 From cdeeaaba174886aa6c1ff4c0c5449c5066dbe82f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 23 Feb 2024 19:27:17 +0100 Subject: mm, slab: deprecate SLAB_MEM_SPREAD flag The SLAB_MEM_SPREAD flag used to be implemented in SLAB, which was removed. SLUB instead relies on the page allocator's NUMA policies. Change the flag's value to 0 to free up the value it had, and mark it for full removal once all users are gone. 
Reported-by: Steven Rostedt Closes: https://lore.kernel.org/all/20240131172027.10f64405@gandalf.local.home/ Reviewed-and-tested-by: Xiongwei Song Reviewed-by: Chengming Zhou Reviewed-by: Roman Gushchin Acked-by: David Rientjes Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index b5f5ee8308d0..b1675ff6b904 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -96,8 +96,6 @@ */ /* Defer freeing slabs to RCU */ #define SLAB_TYPESAFE_BY_RCU ((slab_flags_t __force)0x00080000U) -/* Spread some memory over cpuset */ -#define SLAB_MEM_SPREAD ((slab_flags_t __force)0x00100000U) /* Trace allocations and frees */ #define SLAB_TRACE ((slab_flags_t __force)0x00200000U) @@ -164,6 +162,9 @@ #endif #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ +/* Obsolete unused flag, to be removed */ +#define SLAB_MEM_SPREAD ((slab_flags_t __force)0U) + /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. * -- cgit v1.2.3 From cc61eb851c9ae38546d7df6076fd883d3dbc322d Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 23 Feb 2024 19:27:18 +0100 Subject: mm, slab: use an enum to define SLAB_ cache creation flags The values of SLAB_ cache creation flags are defined by hand, which is tedious and error-prone. Use an enum to assign the bit number and a __SLAB_FLAG_BIT() macro to #define the final flags. This renumbers the flag values, which is OK as they are only used internally. Also define a __SLAB_FLAG_UNUSED macro to assign value to flags disabled by their respective config options in a unified and sparse-friendly way. 
Reviewed-and-tested-by: Xiongwei Song Reviewed-by: Chengming Zhou Reviewed-by: Roman Gushchin Acked-by: David Rientjes Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 94 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 67 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index b1675ff6b904..f6323763cd61 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -21,29 +21,69 @@ #include #include +enum _slab_flag_bits { + _SLAB_CONSISTENCY_CHECKS, + _SLAB_RED_ZONE, + _SLAB_POISON, + _SLAB_KMALLOC, + _SLAB_HWCACHE_ALIGN, + _SLAB_CACHE_DMA, + _SLAB_CACHE_DMA32, + _SLAB_STORE_USER, + _SLAB_PANIC, + _SLAB_TYPESAFE_BY_RCU, + _SLAB_TRACE, +#ifdef CONFIG_DEBUG_OBJECTS + _SLAB_DEBUG_OBJECTS, +#endif + _SLAB_NOLEAKTRACE, + _SLAB_NO_MERGE, +#ifdef CONFIG_FAILSLAB + _SLAB_FAILSLAB, +#endif +#ifdef CONFIG_MEMCG_KMEM + _SLAB_ACCOUNT, +#endif +#ifdef CONFIG_KASAN_GENERIC + _SLAB_KASAN, +#endif + _SLAB_NO_USER_FLAGS, +#ifdef CONFIG_KFENCE + _SLAB_SKIP_KFENCE, +#endif +#ifndef CONFIG_SLUB_TINY + _SLAB_RECLAIM_ACCOUNT, +#endif + _SLAB_OBJECT_POISON, + _SLAB_CMPXCHG_DOUBLE, + _SLAB_FLAGS_LAST_BIT +}; + +#define __SLAB_FLAG_BIT(nr) ((slab_flags_t __force)(1U << (nr))) +#define __SLAB_FLAG_UNUSED ((slab_flags_t __force)(0U)) /* * Flags to pass to kmem_cache_create(). 
* The ones marked DEBUG need CONFIG_SLUB_DEBUG enabled, otherwise are no-op */ /* DEBUG: Perform (expensive) checks on alloc/free */ -#define SLAB_CONSISTENCY_CHECKS ((slab_flags_t __force)0x00000100U) +#define SLAB_CONSISTENCY_CHECKS __SLAB_FLAG_BIT(_SLAB_CONSISTENCY_CHECKS) /* DEBUG: Red zone objs in a cache */ -#define SLAB_RED_ZONE ((slab_flags_t __force)0x00000400U) +#define SLAB_RED_ZONE __SLAB_FLAG_BIT(_SLAB_RED_ZONE) /* DEBUG: Poison objects */ -#define SLAB_POISON ((slab_flags_t __force)0x00000800U) +#define SLAB_POISON __SLAB_FLAG_BIT(_SLAB_POISON) /* Indicate a kmalloc slab */ -#define SLAB_KMALLOC ((slab_flags_t __force)0x00001000U) +#define SLAB_KMALLOC __SLAB_FLAG_BIT(_SLAB_KMALLOC) /* Align objs on cache lines */ -#define SLAB_HWCACHE_ALIGN ((slab_flags_t __force)0x00002000U) +#define SLAB_HWCACHE_ALIGN __SLAB_FLAG_BIT(_SLAB_HWCACHE_ALIGN) /* Use GFP_DMA memory */ -#define SLAB_CACHE_DMA ((slab_flags_t __force)0x00004000U) +#define SLAB_CACHE_DMA __SLAB_FLAG_BIT(_SLAB_CACHE_DMA) /* Use GFP_DMA32 memory */ -#define SLAB_CACHE_DMA32 ((slab_flags_t __force)0x00008000U) +#define SLAB_CACHE_DMA32 __SLAB_FLAG_BIT(_SLAB_CACHE_DMA32) /* DEBUG: Store the last owner for bug hunting */ -#define SLAB_STORE_USER ((slab_flags_t __force)0x00010000U) +#define SLAB_STORE_USER __SLAB_FLAG_BIT(_SLAB_STORE_USER) /* Panic if kmem_cache_create() fails */ -#define SLAB_PANIC ((slab_flags_t __force)0x00040000U) +#define SLAB_PANIC __SLAB_FLAG_BIT(_SLAB_PANIC) /* * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS! * @@ -95,19 +135,19 @@ * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU. 
*/ /* Defer freeing slabs to RCU */ -#define SLAB_TYPESAFE_BY_RCU ((slab_flags_t __force)0x00080000U) +#define SLAB_TYPESAFE_BY_RCU __SLAB_FLAG_BIT(_SLAB_TYPESAFE_BY_RCU) /* Trace allocations and frees */ -#define SLAB_TRACE ((slab_flags_t __force)0x00200000U) +#define SLAB_TRACE __SLAB_FLAG_BIT(_SLAB_TRACE) /* Flag to prevent checks on free */ #ifdef CONFIG_DEBUG_OBJECTS -# define SLAB_DEBUG_OBJECTS ((slab_flags_t __force)0x00400000U) +# define SLAB_DEBUG_OBJECTS __SLAB_FLAG_BIT(_SLAB_DEBUG_OBJECTS) #else -# define SLAB_DEBUG_OBJECTS 0 +# define SLAB_DEBUG_OBJECTS __SLAB_FLAG_UNUSED #endif /* Avoid kmemleak tracing */ -#define SLAB_NOLEAKTRACE ((slab_flags_t __force)0x00800000U) +#define SLAB_NOLEAKTRACE __SLAB_FLAG_BIT(_SLAB_NOLEAKTRACE) /* * Prevent merging with compatible kmem caches. This flag should be used @@ -119,25 +159,25 @@ * - performance critical caches, should be very rare and consulted with slab * maintainers, and not used together with CONFIG_SLUB_TINY */ -#define SLAB_NO_MERGE ((slab_flags_t __force)0x01000000U) +#define SLAB_NO_MERGE __SLAB_FLAG_BIT(_SLAB_NO_MERGE) /* Fault injection mark */ #ifdef CONFIG_FAILSLAB -# define SLAB_FAILSLAB ((slab_flags_t __force)0x02000000U) +# define SLAB_FAILSLAB __SLAB_FLAG_BIT(_SLAB_FAILSLAB) #else -# define SLAB_FAILSLAB 0 +# define SLAB_FAILSLAB __SLAB_FLAG_UNUSED #endif /* Account to memcg */ #ifdef CONFIG_MEMCG_KMEM -# define SLAB_ACCOUNT ((slab_flags_t __force)0x04000000U) +# define SLAB_ACCOUNT __SLAB_FLAG_BIT(_SLAB_ACCOUNT) #else -# define SLAB_ACCOUNT 0 +# define SLAB_ACCOUNT __SLAB_FLAG_UNUSED #endif #ifdef CONFIG_KASAN_GENERIC -#define SLAB_KASAN ((slab_flags_t __force)0x08000000U) +#define SLAB_KASAN __SLAB_FLAG_BIT(_SLAB_KASAN) #else -#define SLAB_KASAN 0 +#define SLAB_KASAN __SLAB_FLAG_UNUSED #endif /* @@ -145,25 +185,25 @@ * Intended for caches created for self-tests so they have only flags * specified in the code and other flags are ignored. 
*/ -#define SLAB_NO_USER_FLAGS ((slab_flags_t __force)0x10000000U) +#define SLAB_NO_USER_FLAGS __SLAB_FLAG_BIT(_SLAB_NO_USER_FLAGS) #ifdef CONFIG_KFENCE -#define SLAB_SKIP_KFENCE ((slab_flags_t __force)0x20000000U) +#define SLAB_SKIP_KFENCE __SLAB_FLAG_BIT(_SLAB_SKIP_KFENCE) #else -#define SLAB_SKIP_KFENCE 0 +#define SLAB_SKIP_KFENCE __SLAB_FLAG_UNUSED #endif /* The following flags affect the page allocator grouping pages by mobility */ /* Objects are reclaimable */ #ifndef CONFIG_SLUB_TINY -#define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000U) +#define SLAB_RECLAIM_ACCOUNT __SLAB_FLAG_BIT(_SLAB_RECLAIM_ACCOUNT) #else -#define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0) +#define SLAB_RECLAIM_ACCOUNT __SLAB_FLAG_UNUSED #endif #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ /* Obsolete unused flag, to be removed */ -#define SLAB_MEM_SPREAD ((slab_flags_t __force)0U) +#define SLAB_MEM_SPREAD __SLAB_FLAG_UNUSED /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. -- cgit v1.2.3 From 96d8dbb6f65041b670a79e8ae76f67cc11dee203 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 23 Feb 2024 19:27:19 +0100 Subject: mm, slab, kasan: replace kasan_never_merge() with SLAB_NO_MERGE The SLAB_KASAN flag prevents merging of caches in some configurations, which is handled in a rather complicated way via kasan_never_merge(). Since we now have a generic SLAB_NO_MERGE flag, we can instead use it for KASAN caches in addition to SLAB_KASAN in those configurations, and simplify the SLAB_NEVER_MERGE handling. 
Tested-by: Xiongwei Song Reviewed-by: Chengming Zhou Reviewed-by: Andrey Konovalov Tested-by: David Rientjes Signed-off-by: Vlastimil Babka --- include/linux/kasan.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index dbb06d789e74..70d6a8f6e25d 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -429,7 +429,6 @@ struct kasan_cache { }; size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object); -slab_flags_t kasan_never_merge(void); void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags); @@ -446,11 +445,6 @@ static inline size_t kasan_metadata_size(struct kmem_cache *cache, { return 0; } -/* And thus nothing prevents cache merging. */ -static inline slab_flags_t kasan_never_merge(void) -{ - return 0; -} /* And no cache-related metadata initialization is required. */ static inline void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, -- cgit v1.2.3 From f1d00496a15bcdfe54bdb54cf1e7f207d6ad9b79 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 8 Feb 2024 21:21:13 +0100 Subject: mtd: lpc32xx: use typedef for dma_filter_fn Use existing typedef for dma_filter_fn to avoid duplicating type definition. 
Signed-off-by: Krzysztof Kozlowski Acked-by: Vladimir Zapolskiy Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20240208202113.630190-1-krzysztof.kozlowski@linaro.org --- include/linux/mtd/lpc32xx_mlc.h | 2 +- include/linux/mtd/lpc32xx_slc.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/lpc32xx_mlc.h b/include/linux/mtd/lpc32xx_mlc.h index d168c628c0d5..35e971be0950 100644 --- a/include/linux/mtd/lpc32xx_mlc.h +++ b/include/linux/mtd/lpc32xx_mlc.h @@ -11,7 +11,7 @@ #include struct lpc32xx_mlc_platform_data { - bool (*dma_filter)(struct dma_chan *chan, void *filter_param); + dma_filter_fn dma_filter; }; #endif /* __LINUX_MTD_LPC32XX_MLC_H */ diff --git a/include/linux/mtd/lpc32xx_slc.h b/include/linux/mtd/lpc32xx_slc.h index cf54a9f80460..a044b806566b 100644 --- a/include/linux/mtd/lpc32xx_slc.h +++ b/include/linux/mtd/lpc32xx_slc.h @@ -11,7 +11,7 @@ #include struct lpc32xx_slc_platform_data { - bool (*dma_filter)(struct dma_chan *chan, void *filter_param); + dma_filter_fn dma_filter; }; #endif /* __LINUX_MTD_LPC32XX_SLC_H */ -- cgit v1.2.3 From 081064cc103929c8a0d7d5fd057110c739c62ee1 Mon Sep 17 00:00:00 2001 From: Marcel Hamer Date: Fri, 16 Feb 2024 16:50:22 +0100 Subject: mtd: fix minor comment typo for struct mtd_master Minor typo in the suspend description. Signed-off-by: Marcel Hamer Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20240216155022.79371-1-marcel.hamer@windriver.com --- include/linux/mtd/mtd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 914a9f974baa..8d10d9d2e830 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -223,7 +223,7 @@ struct mtd_part { * @partitions_lock: lock protecting accesses to the partition list. Protects * not only the master partition list, but also all * sub-partitions. 
- * @suspended: et to 1 when the device is suspended, 0 otherwise + * @suspended: set to 1 when the device is suspended, 0 otherwise * * This struct is embedded in mtd_info and contains master-specific * properties/fields. The master is the root MTD device from the MTD partition -- cgit v1.2.3 From 3ad6eb0683a1edbb4bb117b85d61f17a879155a1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 25 Feb 2024 23:54:59 +0100 Subject: tick: Start centralizing tick related CPU hotplug operations During the CPU offlining process, the various timer tick features are shut down from scattered places, sometimes from teardown callbacks on stop machine, sometimes through explicit calls, sometimes from the control CPU after the CPU died. The reason why these shutdown operations are spread around is not always clear and it makes the tick lifecycle hard to follow. The tick should be shut down in order from highest to lowest level: On stop machine from the dying CPU (high-level): 1) Hand-over the timekeeping duty (tick_handover_do_timer()) 2) Cancel the tick implementation called by the clockevent callback (tick_cancel_sched_timer()) 3) Shutdown broadcasting (tick_offline_cpu() / tick_broadcast_offline()) On stop machine from the dying CPU (low-level): 4) Shutdown clockevents drivers (CPUHP_AP_*_TIMER_STARTING states) From the control CPU after the CPU died (low-level): 5) Shutdown/unregister/cleanup clockevents for the dead CPU (tick_cleanup_dead_cpu()) Instead the current order is 2, 4 (both from CPU hotplug states), then 1 and 3 through direct calls. This layout and order don't make much sense. The operations 1, 2, 3 should be gathered together and in order. Sort this situation with creating a new TICK shut-down CPU hotplug state and start with introducing the timekeeping duty hand-over there. The state must precede hrtimers migration because the tick hrtimer will be stopped from it in a further patch. 
Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240225225508.11587-8-frederic@kernel.org --- include/linux/cpuhotplug.h | 1 + include/linux/tick.h | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 7651904c6db5..35e78ddb2b37 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -184,6 +184,7 @@ enum cpuhp_state { CPUHP_AP_ARM64_ISNDEP_STARTING, CPUHP_AP_SMPCFD_DYING, CPUHP_AP_HRTIMERS_DYING, + CPUHP_AP_TICK_DYING, CPUHP_AP_X86_TBOOT_DYING, CPUHP_AP_ARM_CACHE_B15_RAC_DYING, CPUHP_AP_ONLINE, diff --git a/include/linux/tick.h b/include/linux/tick.h index 716d17f31c45..afff4c207bd8 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -19,16 +19,20 @@ extern void __init tick_init(void); extern void tick_suspend_local(void); /* Should be core only, but XEN resume magic and ARM BL switcher require it */ extern void tick_resume_local(void); -extern void tick_handover_do_timer(void); extern void tick_cleanup_dead_cpu(int cpu); #else /* CONFIG_GENERIC_CLOCKEVENTS */ static inline void tick_init(void) { } static inline void tick_suspend_local(void) { } static inline void tick_resume_local(void) { } -static inline void tick_handover_do_timer(void) { } static inline void tick_cleanup_dead_cpu(int cpu) { } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ +#if defined(CONFIG_GENERIC_CLOCKEVENTS) && defined(CONFIG_HOTPLUG_CPU) +extern int tick_cpu_dying(unsigned int cpu); +#else +#define tick_cpu_dying NULL +#endif + #if defined(CONFIG_GENERIC_CLOCKEVENTS) && defined(CONFIG_SUSPEND) extern void tick_freeze(void); extern void tick_unfreeze(void); -- cgit v1.2.3 From ef8969bb552c1c75e997a42d3e2c576b6ed4025a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 25 Feb 2024 23:55:01 +0100 Subject: tick: Move broadcast cancellation up to CPUHP_AP_TICK_DYING The 
broadcast shutdown code is executed through a random explicit call within stop machine from the outgoing CPU. However the tick broadcast is a midware between the tick callback and the clocksource, therefore it makes more sense to shut it down after the tick callback and before the clocksource drivers. Move it instead to the common tick shutdown CPU hotplug state where related operations can be ordered from highest to lowest level. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240225225508.11587-10-frederic@kernel.org --- include/linux/tick.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tick.h b/include/linux/tick.h index afff4c207bd8..c7840ae8ebaf 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -73,12 +73,6 @@ extern void tick_broadcast_control(enum tick_broadcast_mode mode); static inline void tick_broadcast_control(enum tick_broadcast_mode mode) { } #endif /* BROADCAST */ -#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_HOTPLUG_CPU) -extern void tick_offline_cpu(unsigned int cpu); -#else -static inline void tick_offline_cpu(unsigned int cpu) { } -#endif - #ifdef CONFIG_GENERIC_CLOCKEVENTS extern int tick_broadcast_oneshot_control(enum tick_broadcast_state state); #else -- cgit v1.2.3 From 500f8f9bced86f0c0f2482773bd64a1b7ec9c4e1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 25 Feb 2024 23:55:07 +0100 Subject: tick: Assume timekeeping is correctly handed over upon last offline idle call The timekeeping duty is handed over from the outgoing CPU on stop machine, then the oneshot tick is stopped right after. Therefore it's guaranteed that the current CPU isn't the timekeeper upon its last call to idle. 
Besides, calling tick_nohz_idle_stop_tick() while the dying CPU goes into idle suggests that the tick is going to be stopped while it is actually stopped already from the appropriate CPU hotplug state. Remove the confusing call and the obsolete case handling and convert it to a sanity check that verifies the above assumption. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240225225508.11587-16-frederic@kernel.org --- include/linux/tick.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tick.h b/include/linux/tick.h index c7840ae8ebaf..44fddfa93e18 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -29,8 +29,10 @@ static inline void tick_cleanup_dead_cpu(int cpu) { } #if defined(CONFIG_GENERIC_CLOCKEVENTS) && defined(CONFIG_HOTPLUG_CPU) extern int tick_cpu_dying(unsigned int cpu); +extern void tick_assert_timekeeping_handover(void); #else #define tick_cpu_dying NULL +static inline void tick_assert_timekeeping_handover(void) { } #endif #if defined(CONFIG_GENERIC_CLOCKEVENTS) && defined(CONFIG_SUSPEND) -- cgit v1.2.3 From 4a5917cd504c7afd5e9de7166eb710687a9b026f Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Tue, 13 Feb 2024 12:48:52 +0200 Subject: clk: ti: Improve clksel clock bit parsing for reg property Because of legacy reasons, the TI clksel composite clocks can have overlapping reg properties, and use a custom ti,bit-shift property. For the clksel clocks we can start using the standard reg property instead of the custom ti,bit-shift property. To do this, let's add a ti_clk_get_legacy_bit_shift() helper, and make ti_clk_get_reg_addr() populate the clock bit offset. This makes it possible to update the devicetree files to use the reg property one clock at a time.
Acked-by: Stephen Boyd Signed-off-by: Tony Lindgren --- include/linux/clk/ti.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index cbfcbf186ce3..e656f63efdce 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -13,11 +13,14 @@ /** * struct clk_omap_reg - OMAP register declaration * @offset: offset from the master IP module base address + * @bit: register bit offset * @index: index of the master IP module + * @flags: flags */ struct clk_omap_reg { void __iomem *ptr; u16 offset; + u8 bit; u8 index; u8 flags; }; -- cgit v1.2.3 From 8afc7a78d55de726b2747d7775c54def79509ec5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 22 Feb 2024 10:50:10 +0000 Subject: ipv6: prepare inet6_fill_ifinfo() for RCU protection We want to use RCU protection instead of RTNL for inet6_fill_ifinfo(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f07c8374f29c..09023e44db4e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4354,8 +4354,10 @@ static inline bool netif_testing(const struct net_device *dev) */ static inline bool netif_oper_up(const struct net_device *dev) { - return (dev->operstate == IF_OPER_UP || - dev->operstate == IF_OPER_UNKNOWN /* backward compat */); + unsigned int operstate = READ_ONCE(dev->operstate); + + return operstate == IF_OPER_UP || + operstate == IF_OPER_UNKNOWN /* backward compat */; } /** -- cgit v1.2.3 From 386520e0ecc01004d3a29c70c5a77d4bbf8a8420 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 22 Feb 2024 10:50:15 +0000 Subject: rtnetlink: add RTNL_FLAG_DUMP_UNLOCKED flag Similarly to RTNL_FLAG_DOIT_UNLOCKED, this new flag allows dump operations registered via rtnl_register() or rtnl_register_module() to opt-out from RTNL 
protection. Signed-off-by: Eric Dumazet Reviewed-by: Donald Hunter Signed-off-by: David S. Miller --- include/linux/netlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 1a4445bf2ab9..5df7340d4dab 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -291,6 +291,7 @@ struct netlink_callback { u16 answer_flags; u32 min_dump_alloc; unsigned int prev_seq, seq; + int flags; bool strict_check; union { u8 ctx[48]; @@ -323,6 +324,7 @@ struct netlink_dump_control { void *data; struct module *module; u32 min_dump_alloc; + int flags; }; int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, -- cgit v1.2.3 From 4d52f575e258c6f93f4180c21afda8634b0d2af5 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 20 Feb 2024 09:36:24 +0100 Subject: regulator: max8973: Finalize switch to GPIO descriptors The dvs gpio was still using a legacy number passed from the platform data. There are no in-tree users of the platform data so just switch it to a gpio descriptor and obtain it in probe(), the device tree users will work just as fine with this. Drop the entirely unused enable_gpio from the platform data as well. The device tree bindings mentions this but the driver does not look for it and makes no use of it: it should probably be implemented properly in a separate patch. Signed-off-by: Linus Walleij Link: https://msgid.link/r/20240220-descriptors-regulators-v1-1-097f608694be@linaro.org Acked-by: Lee Jones Signed-off-by: Mark Brown --- include/linux/regulator/max8973-regulator.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regulator/max8973-regulator.h b/include/linux/regulator/max8973-regulator.h index 8313e7ed6aec..a225e9eeb30d 100644 --- a/include/linux/regulator/max8973-regulator.h +++ b/include/linux/regulator/max8973-regulator.h @@ -48,10 +48,6 @@ * control signal from EN input pin. 
If it is false then * voltage output will be enabled/disabled through EN bit of * device register. - * @enable_gpio: Enable GPIO. If EN pin is controlled through GPIO from host - * then GPIO number can be provided. If no GPIO controlled then - * it should be -1. - * @dvs_gpio: GPIO for dvs. It should be -1 if this is tied with fixed logic. * @dvs_def_state: Default state of dvs. 1 if it is high else 0. */ struct max8973_regulator_platform_data { @@ -59,8 +55,6 @@ struct max8973_regulator_platform_data { unsigned long control_flags; unsigned long junction_temp_warning; bool enable_ext_control; - int enable_gpio; - int dvs_gpio; unsigned dvs_def_state:1; }; -- cgit v1.2.3 From e450a2b3a335332d4a51fe10c9fff8150c6e2364 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 20 Feb 2024 09:36:25 +0100 Subject: regulator: da9055: Fully convert to GPIO descriptors The DA9055 regulator was touched before, requiring enable GPIOs to be passed from pdata. As we have a device for each regulator, obtain the three gpios ren ("regulator enable"), rsel ("regulator select") and the ena ("enable") GPIO associated with the regulator enable directly from the device and cut down on the amount of GPIO numbers passed as platform data. The ren and rsel are just requested as inputs: these are actually handled by hardware. The ena gpios are driven actively by the regulator core. There are no in-tree users, but the regulators are instantiated from the (undocumented) device tree nodes with "dlg,da9055-regulator" as compatible, and by simply adding regulator-enable-gpios, regulator-select-gpios and enable-gpios to this DT node, all will work as before.
Signed-off-by: Linus Walleij Link: https://msgid.link/r/20240220-descriptors-regulators-v1-2-097f608694be@linaro.org Acked-by: Lee Jones Signed-off-by: Mark Brown --- include/linux/mfd/da9055/pdata.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/da9055/pdata.h b/include/linux/mfd/da9055/pdata.h index d3f126990ad0..137a2b067512 100644 --- a/include/linux/mfd/da9055/pdata.h +++ b/include/linux/mfd/da9055/pdata.h @@ -7,7 +7,6 @@ #define DA9055_MAX_REGULATORS 8 struct da9055; -struct gpio_desc; enum gpio_select { NO_GPIO = 0, @@ -23,16 +22,6 @@ struct da9055_pdata { struct regulator_init_data *regulators[DA9055_MAX_REGULATORS]; /* Enable RTC in RESET Mode */ bool reset_enable; - /* - * GPI muxed pin to control - * regulator state A/B, 0 if not available. - */ - int *gpio_ren; - /* - * GPI muxed pin to control - * regulator set, 0 if not available. - */ - int *gpio_rsel; /* * Regulator mode control bits value (GPI offset) that * controls the regulator state, 0 if not available. @@ -43,7 +32,5 @@ struct da9055_pdata { * controls the regulator set A/B, 0 if not available. */ enum gpio_select *reg_rsel; - /* GPIO descriptors to enable regulator, NULL if not available */ - struct gpio_desc **ena_gpiods; }; #endif /* __DA9055_PDATA_H */ -- cgit v1.2.3 From 95daa868f22b509ad641bf003d9d441d6a2fa505 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 20 Feb 2024 09:36:26 +0100 Subject: regulator: lp8788-buck: Fully convert to GPIO descriptors This converts the LP8788 BUCK regulator driver to use GPIO descriptors. BUCK1 can use one DVS GPIO and BUCK2 can use two DVS GPIOS, and no more so just hardcode two GPIO descriptors into the per-DVS state containers. Obtain the descriptors from each regulators subdevice. As there are no in-tree users, board files need to populate descriptor tables for the buck regulator devices when they want to use this driver. 
BUCK1 needs a GPIO descriptor at index 0 and BUCK2 needs two GPIO descriptors at indices 0 and 1. Signed-off-by: Linus Walleij Link: https://msgid.link/r/20240220-descriptors-regulators-v1-3-097f608694be@linaro.org Acked-by: Lee Jones Signed-off-by: Mark Brown --- include/linux/mfd/lp8788.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/lp8788.h b/include/linux/mfd/lp8788.h index 3d5c480d58ea..51b47966a04d 100644 --- a/include/linux/mfd/lp8788.h +++ b/include/linux/mfd/lp8788.h @@ -10,7 +10,6 @@ #ifndef __MFD_LP8788_H__ #define __MFD_LP8788_H__ -#include #include #include #include @@ -159,21 +158,17 @@ struct lp8788; /* * lp8788_buck1_dvs - * @gpio : gpio pin number for dvs control * @vsel : dvs selector for buck v1 register */ struct lp8788_buck1_dvs { - int gpio; enum lp8788_dvs_sel vsel; }; /* * lp8788_buck2_dvs - * @gpio : two gpio pin numbers are used for dvs * @vsel : dvs selector for buck v2 register */ struct lp8788_buck2_dvs { - int gpio[LP8788_NUM_BUCK2_DVS]; enum lp8788_dvs_sel vsel; }; @@ -268,8 +263,8 @@ struct lp8788_vib_platform_data { * @buck_data : regulator initial data for buck * @dldo_data : regulator initial data for digital ldo * @aldo_data : regulator initial data for analog ldo - * @buck1_dvs : gpio configurations for buck1 dvs - * @buck2_dvs : gpio configurations for buck2 dvs + * @buck1_dvs : configurations for buck1 dvs + * @buck2_dvs : configurations for buck2 dvs * @chg_pdata : platform data for charger driver * @alarm_sel : rtc alarm selection (1 or 2) * @bl_pdata : configurable data for backlight driver -- cgit v1.2.3 From 84618d5e31cfd01fc3f53a8c2ebb68bc43d8b760 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 20 Feb 2024 09:36:27 +0100 Subject: regulator: max8997: Convert to GPIO descriptors This rewrites the max8997 regulator driver to fetch the dvs regulators as descriptors.
This will likely mostly come from the device tree since there are no in-tree users of the platform data, but supplying GPIO descriptor tables from board files is also possible if needed. Signed-off-by: Linus Walleij Link: https://msgid.link/r/20240220-descriptors-regulators-v1-4-097f608694be@linaro.org Acked-by: Lee Jones Signed-off-by: Mark Brown --- include/linux/mfd/max8997.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/max8997.h b/include/linux/mfd/max8997.h index 6193905abbb5..5c2cc1103437 100644 --- a/include/linux/mfd/max8997.h +++ b/include/linux/mfd/max8997.h @@ -178,7 +178,6 @@ struct max8997_platform_data { * */ bool ignore_gpiodvs_side_effect; - int buck125_gpios[3]; /* GPIO of [0]SET1, [1]SET2, [2]SET3 */ int buck125_default_idx; /* Default value of SET1, 2, 3 */ unsigned int buck1_voltage[8]; /* buckx_voltage in uV */ bool buck1_gpiodvs; -- cgit v1.2.3 From f25828a1eae1ee1a9257e2818b237b8208bd383e Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 20 Feb 2024 09:36:28 +0100 Subject: regulator: max8998: Convert to GPIO descriptors This rewrites the max8998 regulator driver to fetch the dvs regulators as descriptors. This will likely mostly come from the device tree since there are no in-tree users of the platform data, but supplying GPIO descriptor tables from board files is also possible if needed. Signed-off-by: Linus Walleij Link: https://msgid.link/r/20240220-descriptors-regulators-v1-5-097f608694be@linaro.org Acked-by: Lee Jones Signed-off-by: Mark Brown --- include/linux/mfd/max8998.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/max8998.h b/include/linux/mfd/max8998.h index 79c020bd0c70..a054e55c8646 100644 --- a/include/linux/mfd/max8998.h +++ b/include/linux/mfd/max8998.h @@ -65,10 +65,7 @@ struct max8998_regulator_data { * be other than the preset values. 
* @buck1_voltage: BUCK1 DVS mode 1 voltage registers * @buck2_voltage: BUCK2 DVS mode 2 voltage registers - * @buck1_set1: BUCK1 gpio pin 1 to set output voltage - * @buck1_set2: BUCK1 gpio pin 2 to set output voltage * @buck1_default_idx: Default for BUCK1 gpio pin 1, 2 - * @buck2_set3: BUCK2 gpio pin to set output voltage * @buck2_default_idx: Default for BUCK2 gpio pin. * @wakeup: Allow to wake up from suspend * @rtc_delay: LP3974 RTC chip bug that requires delay after a register @@ -91,10 +88,7 @@ struct max8998_platform_data { bool buck_voltage_lock; int buck1_voltage[4]; int buck2_voltage[2]; - int buck1_set1; - int buck1_set2; int buck1_default_idx; - int buck2_set3; int buck2_default_idx; bool wakeup; bool rtc_delay; -- cgit v1.2.3 From 7b1d87af14d9ae902ed0c5dc5fabf4eea5abdf02 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Mon, 19 Feb 2024 16:33:18 -0600 Subject: spi: add spi_optimize_message() APIs This adds a new spi_optimize_message() function that can be used to optimize SPI messages that are used more than once. Peripheral drivers that use the same message multiple times can use this API to perform SPI message validation and controller-specific optimizations once and then reuse the message while avoiding the overhead of revalidating the message on each spi_(a)sync() call. Internally, the SPI core will also call this function for each message if the peripheral driver did not explicitly call it. This is done so that controller drivers don't have to have multiple code paths for optimized and non-optimized messages. A hook is provided for controller drivers to perform controller-specific optimizations.
Suggested-by: Martin Sperl Link: https://lore.kernel.org/linux-spi/39DEC004-10A1-47EF-9D77-276188D2580C@martin.sperl.org/ Signed-off-by: David Lechner Link: https://msgid.link/r/20240219-mainline-spi-precook-message-v2-1-4a762c6701b9@baylibre.com Reviewed-by: Jonathan Cameron Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 2b8e2746769a..ddfb66dd4caf 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -475,6 +475,8 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch * * @set_cs: set the logic level of the chip select line. May be called * from interrupt context. + * @optimize_message: optimize the message for reuse + * @unoptimize_message: release resources allocated by optimize_message * @prepare_message: set up the controller to transfer a single message, * for example doing DMA mapping. Called from threaded * context. 
@@ -715,6 +717,8 @@ struct spi_controller { struct completion xfer_completion; size_t max_dma_len; + int (*optimize_message)(struct spi_message *msg); + int (*unoptimize_message)(struct spi_message *msg); int (*prepare_transfer_hardware)(struct spi_controller *ctlr); int (*transfer_one_message)(struct spi_controller *ctlr, struct spi_message *mesg); @@ -1111,6 +1115,8 @@ struct spi_transfer { * @spi: SPI device to which the transaction is queued * @is_dma_mapped: if true, the caller provided both DMA and CPU virtual * addresses for each transfer buffer + * @pre_optimized: peripheral driver pre-optimized the message + * @optimized: the message is in the optimized state * @prepared: spi_prepare_message was called for the this message * @status: zero for success, else negative errno * @complete: called to report transaction completions @@ -1120,6 +1126,7 @@ struct spi_transfer { * successful segments * @queue: for use by whichever driver currently owns the message * @state: for use by whichever driver currently owns the message + * @opt_state: for use by whichever driver currently owns the message * @resources: for resource management when the SPI message is processed * * A @spi_message is used to execute an atomic sequence of data transfers, @@ -1143,6 +1150,11 @@ struct spi_message { unsigned is_dma_mapped:1; + /* spi_optimize_message() was called for this message */ + bool pre_optimized; + /* __spi_optimize_message() was called for this message */ + bool optimized; + /* spi_prepare_message() was called for this message */ bool prepared; @@ -1172,6 +1184,11 @@ struct spi_message { */ struct list_head queue; void *state; + /* + * Optional state for use by controller driver between calls to + * __spi_optimize_message() and __spi_unoptimize_message(). 
+ */ + void *opt_state; /* List of spi_res resources when the SPI message is processed */ struct list_head resources; @@ -1255,6 +1272,9 @@ static inline void spi_message_free(struct spi_message *m) kfree(m); } +extern int spi_optimize_message(struct spi_device *spi, struct spi_message *msg); +extern void spi_unoptimize_message(struct spi_message *msg); + extern int spi_setup(struct spi_device *spi); extern int spi_async(struct spi_device *spi, struct spi_message *message); extern int spi_slave_abort(struct spi_device *spi); -- cgit v1.2.3 From 3d8bb3d3080d1609c2a6bef007ede2d8f8ffea5b Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 22 Feb 2024 11:25:12 +0100 Subject: gpio: provide for_each_hwgpio() We only provide iterators for requested GPIOs to provider drivers. In order to allow them to display debug information about all GPIOs, let's provide a variant for iterating over all GPIOs. Signed-off-by: Bartosz Golaszewski Reviewed-by: Linus Walleij --- include/linux/gpio/driver.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 9c1fbfaebaa8..175129a92656 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -551,6 +551,21 @@ DEFINE_CLASS(_gpiochip_for_each_data, }), const char **label, int *i) +/** + * for_each_hwgpio - Iterates over all GPIOs for given chip. + * @_chip: Chip to iterate over. + * @_i: Loop counter. + * @_label: Place to store the address of the label if the GPIO is requested. + * Set to NULL for unused GPIOs. 
+ */ +#define for_each_hwgpio(_chip, _i, _label) \ + for (CLASS(_gpiochip_for_each_data, _data)(&_label, &_i); \ + *_data.i < _chip->ngpio; \ + (*_data.i)++, kfree(*(_data.label)), *_data.label = NULL) \ + if (IS_ERR(*_data.label = \ + gpiochip_dup_line_label(_chip, *_data.i))) {} \ + else + /** * for_each_requested_gpio_in_range - iterates over requested GPIOs in a given range * @_chip: the chip to query -- cgit v1.2.3 From 0d60d8df6f493bb46bf5db40d39dd60a1bafdd4e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 23 Feb 2024 12:32:08 +0000 Subject: dpll: rely on rcu for netdev_dpll_pin() This fixes a possible UAF in if_nlmsg_size(), which can run without RTNL. Add rcu protection to "struct dpll_pin" Move netdev_dpll_pin() from netdevice.h to dpll.h to decrease name pollution. Note: This looks possible to no longer acquire RTNL in netdev_dpll_pin_assign() later in net-next. v2: do not force rcu_read_lock() in rtnl_dpll_pin_size() (Jiri Pirko) Fixes: 5f1842692880 ("netdev: expose DPLL pin handle for netdevice") Signed-off-by: Eric Dumazet Cc: Arkadiusz Kubalewski Cc: Vadim Fedorenko Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20240223123208.3543319-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/dpll.h | 11 +++++++++++ include/linux/netdevice.h | 11 +---------- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dpll.h b/include/linux/dpll.h index 9cf896ea1d41..4ec2fe9caf5a 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -10,6 +10,8 @@ #include #include #include +#include +#include struct dpll_device; struct dpll_pin; @@ -167,4 +169,13 @@ int dpll_device_change_ntf(struct dpll_device *dpll); int dpll_pin_change_ntf(struct dpll_pin *pin); +static inline struct dpll_pin *netdev_dpll_pin(const struct net_device *dev) +{ +#if IS_ENABLED(CONFIG_DPLL) + return rcu_dereference_rtnl(dev->dpll_pin); +#else + return NULL; +#endif +} + #endif diff --git 
a/include/linux/netdevice.h b/include/linux/netdevice.h index ef7bfbb98497..a9c973b92294 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2469,7 +2469,7 @@ struct net_device { struct devlink_port *devlink_port; #if IS_ENABLED(CONFIG_DPLL) - struct dpll_pin *dpll_pin; + struct dpll_pin __rcu *dpll_pin; #endif #if IS_ENABLED(CONFIG_PAGE_POOL) /** @page_pools: page pools created for this netdevice */ @@ -4035,15 +4035,6 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b); void netdev_dpll_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin); void netdev_dpll_pin_clear(struct net_device *dev); -static inline struct dpll_pin *netdev_dpll_pin(const struct net_device *dev) -{ -#if IS_ENABLED(CONFIG_DPLL) - return dev->dpll_pin; -#else - return NULL; -#endif -} - struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); -- cgit v1.2.3 From 4c8a49854130da0117a0fdb858551824919a2389 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 27 Feb 2024 09:58:15 +0100 Subject: smp: Avoid 'setup_max_cpus' namespace collision/shadowing bringup_nonboot_cpus() gets passed the 'setup_max_cpus' variable in init/main.c - which is also the name of the parameter, shadowing the name. To reduce confusion and to allow the 'setup_max_cpus' value to be #defined in the header, use the 'max_cpus' name for the function parameter name. 
Signed-off-by: Ingo Molnar Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org --- include/linux/cpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index dcb89c987164..61a0ddf2bef6 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -112,7 +112,7 @@ void notify_cpu_starting(unsigned int cpu); extern void cpu_maps_update_begin(void); extern void cpu_maps_update_done(void); int bringup_hibernate_cpu(unsigned int sleep_cpu); -void bringup_nonboot_cpus(unsigned int setup_max_cpus); +void bringup_nonboot_cpus(unsigned int max_cpus); #else /* CONFIG_SMP */ #define cpuhp_tasks_frozen 0 -- cgit v1.2.3 From 3c2f8859ae1ce53f2a89c8e4ca4092101afbff67 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 26 Feb 2024 12:07:31 +0100 Subject: smp: Provide 'setup_max_cpus' definition on UP too This was already defined locally by init/main.c, but let's make it generic, as arch/x86/kernel/cpu/topology.c is going to make use of it to have more uniform code. Reviewed-by: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/smp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index e87520dc2959..7a83fd2fccbe 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -218,6 +218,8 @@ smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, static inline void kick_all_cpus_sync(void) { } static inline void wake_up_all_idle_cpus(void) { } +#define setup_max_cpus 0 + #ifdef CONFIG_UP_LATE_INIT extern void __init up_late_init(void); static inline void smp_init(void) { up_late_init(); } -- cgit v1.2.3 From 5340f7647294fa8ff8cf5a1bee326b2bd8340e27 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 22 Feb 2024 18:30:49 +0100 Subject: thermal: core: Add flags to struct thermal_trip In order to allow thermal zone creators to specify the writability of trip point temperature and hysteresis on a per-trip basis, add a flags field to struct thermal_trip and define flags to represent the desired trip properties. Also make thermal_zone_device_register_with_trips() set the THERMAL_TRIP_FLAG_RW_TEMP flag for all trips covered by the writable trips mask passed to it and modify the thermal sysfs code to look at the trip flags instead of using the writable trips mask directly or checking the presence of the .set_trip_hyst() zone callback. Additionally, make trip_point_temp_store() and trip_point_hyst_store() fail with an error code if the trip passed to one of them has THERMAL_TRIP_FLAG_RW_TEMP or THERMAL_TRIP_FLAG_RW_HYST, respectively, clear in its flags. No intentional functional impact. Signed-off-by: Rafael J. Wysocki --- include/linux/thermal.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index ec0559e98d6f..6eb6f3297ea0 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -64,15 +64,23 @@ enum thermal_notify_event { * @threshold: trip crossing notification threshold miliCelsius * @type: trip point type * @priv: pointer to driver data associated with this trip + * @flags: flags representing binary properties of the trip */ struct thermal_trip { int temperature; int hysteresis; int threshold; enum thermal_trip_type type; + u8 flags; void *priv; }; +#define THERMAL_TRIP_FLAG_RW_TEMP BIT(0) +#define THERMAL_TRIP_FLAG_RW_HYST BIT(1) + +#define THERMAL_TRIP_FLAG_RW (THERMAL_TRIP_FLAG_RW_TEMP | \ + THERMAL_TRIP_FLAG_RW_HYST) + struct thermal_zone_device_ops { int (*bind) (struct thermal_zone_device *, struct thermal_cooling_device *); -- cgit v1.2.3 From 46f5bef8ec2e1e05ad2fda0bcf5ac32e191ec694 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 22 Feb 2024 18:32:03 +0100 Subject: thermal: core: Drop the .set_trip_hyst() thermal zone operation None of the users of the thermal core provides a .set_trip_hyst() thermal zone operation, so drop that callback from struct thermal_zone_device_ops and update trip_point_hyst_store() accordingly. No functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Daniel Lezcano --- include/linux/thermal.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 6eb6f3297ea0..eb4145a64c7e 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -91,7 +91,6 @@ struct thermal_zone_device_ops { int (*change_mode) (struct thermal_zone_device *, enum thermal_device_mode); int (*set_trip_temp) (struct thermal_zone_device *, int, int); - int (*set_trip_hyst) (struct thermal_zone_device *, int, int); int (*get_crit_temp) (struct thermal_zone_device *, int *); int (*set_emul_temp) (struct thermal_zone_device *, int); int (*get_trend) (struct thermal_zone_device *, -- cgit v1.2.3 From 4a62d588a84e13c68017bd16bc9c2531a2cde08f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 22 Feb 2024 19:09:16 +0100 Subject: thermal: core: Eliminate writable trip points masks All of the thermal_zone_device_register_with_trips() callers pass zero writable trip points masks to it, so drop the mask argument from that function and update all of its callers accordingly. This also removes the artificial trip points per zone limit of 32, related to using writable trip points masks. No intentional functional impact. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Daniel Lezcano --- include/linux/thermal.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index eb4145a64c7e..c33f50177f51 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -323,8 +323,7 @@ int thermal_zone_get_crit_temp(struct thermal_zone_device *tz, int *temp); struct thermal_zone_device *thermal_zone_device_register_with_trips( const char *type, const struct thermal_trip *trips, - int num_trips, int mask, - void *devdata, + int num_trips, void *devdata, const struct thermal_zone_device_ops *ops, const struct thermal_zone_params *tzp, int passive_delay, int polling_delay); @@ -383,8 +382,7 @@ void thermal_zone_device_critical(struct thermal_zone_device *tz); static inline struct thermal_zone_device *thermal_zone_device_register_with_trips( const char *type, const struct thermal_trip *trips, - int num_trips, int mask, - void *devdata, + int num_trips, void *devdata, const struct thermal_zone_device_ops *ops, const struct thermal_zone_params *tzp, int passive_delay, int polling_delay) -- cgit v1.2.3 From 4f299135d5668f56be270d224d41eb83d2002038 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Mon, 19 Feb 2024 12:59:15 +0100 Subject: platform/x86: wmi: Prevent incompatible event driver from probing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a WMI event driver has no_notify_data set, then it indicates support for WMI events which provide no notify data, otherwise the notify() callback expects a valid ACPI object as notify data. However if a WMI event driver which requires notify data is bound to a WMI event device which cannot retrieve such data due to the _WED ACPI method being absent, then the driver will be dysfunctional since all WMI events will be dropped due to the missing notify data. 
Fix this by not allowing such WMI event drivers to bind to WMI event devices which do not support retrieving of notify data. Also reword the description of no_notify_data a bit. Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20240219115919.16526-2-W_Armin@gmx.de Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/wmi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/wmi.h b/include/linux/wmi.h index 686291b87852..781958310bfb 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -48,7 +48,7 @@ u8 wmidev_instance_count(struct wmi_device *wdev); * struct wmi_driver - WMI driver structure * @driver: Driver model structure * @id_table: List of WMI GUIDs supported by this driver - * @no_notify_data: WMI events provide no event data + * @no_notify_data: Driver supports WMI events which provide no event data * @probe: Callback for device binding * @remove: Callback for device unbinding * @notify: Callback for receiving WMI events -- cgit v1.2.3 From c3f9109dbc9e2cd0b2c3ba0536431eef282783e9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 19 Feb 2024 21:38:59 -0700 Subject: io_uring/kbuf: flag request if buffer pool is empty after buffer pick Normally we do an extra roundtrip for retries even if the buffer pool has depleted, as we don't check that upfront. Rather than add this check, have the buffer selection methods mark the request with REQ_F_BL_EMPTY if the used buffer group is out of buffers after this selection. This is very cheap to do once we're all the way inside there anyway, and it gives the caller a chance to make better decisions on how to proceed. For example, recv/recvmsg multishot could check this flag when it decides whether to keep receiving or not. 
Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index bd7071aeec5d..d8111d64812b 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -480,6 +480,7 @@ enum { REQ_F_POLL_NO_LAZY_BIT, REQ_F_CANCEL_SEQ_BIT, REQ_F_CAN_POLL_BIT, + REQ_F_BL_EMPTY_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -556,6 +557,8 @@ enum { REQ_F_CANCEL_SEQ = IO_REQ_FLAG(REQ_F_CANCEL_SEQ_BIT), /* file is pollable */ REQ_F_CAN_POLL = IO_REQ_FLAG(REQ_F_CAN_POLL_BIT), + /* buffer list was empty after selection of buffer */ + REQ_F_BL_EMPTY = IO_REQ_FLAG(REQ_F_BL_EMPTY_BIT), }; typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); -- cgit v1.2.3 From 60b2ebf48526567b53e0188dbd1a4df8e646bcc1 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Tue, 27 Feb 2024 19:10:37 +0000 Subject: workqueue: Introduce from_work() helper for cleaner callback declarations To streamline the transition from tasklets to workqueues, a new helper function, from_work(), is introduced. This helper, inspired by existing from_() patterns, utilizes container_of() and eliminates the redundancy of declaring variable types, leading to more concise and readable code. The modified code snippet demonstrates the enhanced clarity achieved with from_work(): void callback(struct work_struct *w) { - struct some_data_structure *local = container_of(w, struct some_data_structure, work); + struct some_data_structure *local = from_work(local, w, work); This change aims to facilitate a smoother transition and uphold code quality standards.
Based on: git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git disable_work-v3 Signed-off-by: Allen Pais Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 0ad534fe6673..64a60b9232d3 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -522,6 +522,9 @@ alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...); #define create_singlethread_workqueue(name) \ alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name) +#define from_work(var, callback_work, work_fieldname) \ + container_of(callback_work, typeof(*var), work_fieldname) + extern void destroy_workqueue(struct workqueue_struct *wq); struct workqueue_attrs *alloc_workqueue_attrs(void); -- cgit v1.2.3 From 2824083db76cb9d4b7910607b367e93b02912865 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 21 Feb 2024 12:14:03 -0500 Subject: ovl: Always reject mounting over case-insensitive directories overlayfs relies on the filesystem setting DCACHE_OP_HASH or DCACHE_OP_COMPARE to reject mounting over case-insensitive directories. Since commit bb9cd9106b22 ("fscrypt: Have filesystems handle their d_ops"), we set ->d_op through a hook in ->d_lookup, which means the root dentry won't have them, causing the mount to accidentally succeed. In v6.7-rc7, the following sequence will succeed to mount, but any dentry other than the root dentry will be a "weird" dentry to ovl and fail with EREMOTE. mkfs.ext4 -O casefold lower.img mount -O loop lower.img lower mount -t overlay -o lowerdir=lower,upperdir=upper,workdir=work ovl /mnt Mounting on a subdirectory fails, as expected, because DCACHE_OP_HASH and DCACHE_OP_COMPARE are properly set by ->lookup. Fix by explicitly rejecting superblocks that allow case-insensitive dentries. Yes, this will be solved when we move d_op configuration back to ->s_d_op. 
Yet, we better have an explicit fix to avoid messing up again. While there, re-sort the entries to have more descriptive error messages first. Fixes: bb9cd9106b22 ("fscrypt: Have filesystems handle their d_ops") Acked-by: Amir Goldstein Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20240221171412.10710-2-krisman@suse.de Signed-off-by: Gabriel Krisman Bertazi --- include/linux/fs.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e6ba0cc6f2ee..a0eb8b5759a6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3282,6 +3282,15 @@ extern int generic_check_addressable(unsigned, u64); extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry); +static inline bool sb_has_encoding(const struct super_block *sb) +{ +#if IS_ENABLED(CONFIG_UNICODE) + return !!sb->s_encoding; +#else + return false; +#endif +} + int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid); int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *); -- cgit v1.2.3 From 8b6bb995d3819218498bdbee4465bffff1497a31 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 21 Feb 2024 12:14:04 -0500 Subject: fscrypt: Factor out a helper to configure the lookup dentry Both fscrypt_prepare_lookup_partial and fscrypt_prepare_lookup will set DCACHE_NOKEY_NAME for dentries when the key is not available. Extract out a helper to set this flag in a single place, in preparation to also add the optimization that will disable ->d_revalidate if possible. 
Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20240221171412.10710-3-krisman@suse.de Signed-off-by: Gabriel Krisman Bertazi --- include/linux/fscrypt.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 12f9e455d569..c76f859cf019 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -261,6 +261,16 @@ static inline bool fscrypt_is_nokey_name(const struct dentry *dentry) return dentry->d_flags & DCACHE_NOKEY_NAME; } +static inline void fscrypt_prepare_dentry(struct dentry *dentry, + bool is_nokey_name) +{ + if (is_nokey_name) { + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_NOKEY_NAME; + spin_unlock(&dentry->d_lock); + } +} + /* crypto.c */ void fscrypt_enqueue_decrypt_work(struct work_struct *); @@ -425,6 +435,11 @@ static inline bool fscrypt_is_nokey_name(const struct dentry *dentry) return false; } +static inline void fscrypt_prepare_dentry(struct dentry *dentry, + bool is_nokey_name) +{ +} + /* crypto.c */ static inline void fscrypt_enqueue_decrypt_work(struct work_struct *work) { -- cgit v1.2.3 From e86e6638d1171c2201ffff16d2c6a6fd975f8383 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 21 Feb 2024 12:14:05 -0500 Subject: fscrypt: Drop d_revalidate for valid dentries during lookup Unencrypted and encrypted-dentries where the key is available don't need to be revalidated by fscrypt, since they don't go stale from under VFS and the key cannot be removed for the encrypted case without evicting the dentry. Disable their d_revalidate hook on the first lookup, to avoid repeated revalidation later. This is done in preparation for always configuring d_op through sb->s_d_op. The only detail is that, since the filesystem might have other features that require revalidation, we only apply this optimization if the d_revalidate handler is fscrypt_d_revalidate itself.
Finally, we need to clean the dentry->flags even for unencrypted dentries, so the ->d_lock might be acquired even for them. In order to avoid doing it for filesystems that don't care about fscrypt at all, we peek ->d_flags without the lock at first, and only acquire it if we actually need to write the flag. Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20240221171412.10710-4-krisman@suse.de Signed-off-by: Gabriel Krisman Bertazi --- include/linux/fscrypt.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index c76f859cf019..78af02b35bd9 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -264,10 +264,29 @@ static inline bool fscrypt_is_nokey_name(const struct dentry *dentry) static inline void fscrypt_prepare_dentry(struct dentry *dentry, bool is_nokey_name) { + /* + * This code tries to only take ->d_lock when necessary to write + * to ->d_flags. We shouldn't be peeking on d_flags for + * DCACHE_OP_REVALIDATE unlocked, but in the unlikely case + * there is a race, the worst it can happen is that we fail to + * unset DCACHE_OP_REVALIDATE and pay the cost of an extra + * d_revalidate. + */ if (is_nokey_name) { spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_NOKEY_NAME; spin_unlock(&dentry->d_lock); + } else if (dentry->d_flags & DCACHE_OP_REVALIDATE && + dentry->d_op->d_revalidate == fscrypt_d_revalidate) { + /* + * Unencrypted dentries and encrypted dentries where the + * key is available are always valid from fscrypt + * perspective. Avoid the cost of calling + * fscrypt_d_revalidate unnecessarily. 
+ */ + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_OP_REVALIDATE; + spin_unlock(&dentry->d_lock); } } @@ -997,6 +1016,9 @@ static inline int fscrypt_prepare_lookup(struct inode *dir, fname->usr_fname = &dentry->d_name; fname->disk_name.name = (unsigned char *)dentry->d_name.name; fname->disk_name.len = dentry->d_name.len; + + fscrypt_prepare_dentry(dentry, false); + return 0; } -- cgit v1.2.3 From e9b10713e82c98a0a909ac55bd485b7d7ff91b85 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 21 Feb 2024 12:14:06 -0500 Subject: fscrypt: Drop d_revalidate once the key is added When a key is added, existing directory dentries in the DCACHE_NOKEY_NAME form are moved by the VFS to the plaintext version. But, since they have the DCACHE_OP_REVALIDATE flag set, revalidation will be done at each lookup only to return immediately, since plaintext dentries can't go stale until eviction. This patch optimizes this case, by dropping the flag once the nokey_name dentry becomes plain-text. Note that non-directory dentries are not moved this way, so they won't be affected. Of course, this can only be done if fscrypt is the only thing requiring revalidation for a dentry. For this reason, we only disable d_revalidate if the .d_revalidate hook is fscrypt_d_revalidate itself. It is safe to do it here because when moving the dentry to the plain-text version, we are holding the d_lock. We might race with a concurrent RCU lookup but this is harmless because, at worst, we will get an extra d_revalidate on the keyed dentry, which will still find the dentry to be valid. Finally, now that we do more than just clear the DCACHE_NOKEY_NAME in fscrypt_handle_d_move, skip it entirely for plaintext dentries, to avoid extra costs. 
Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20240221171412.10710-5-krisman@suse.de Signed-off-by: Gabriel Krisman Bertazi --- include/linux/fscrypt.h | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 78af02b35bd9..772f822dc6b8 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -192,6 +192,8 @@ struct fscrypt_operations { unsigned int *num_devs); }; +int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags); + static inline struct fscrypt_inode_info * fscrypt_get_inode_info(const struct inode *inode) { @@ -221,15 +223,29 @@ static inline bool fscrypt_needs_contents_encryption(const struct inode *inode) } /* - * When d_splice_alias() moves a directory's no-key alias to its plaintext alias - * as a result of the encryption key being added, DCACHE_NOKEY_NAME must be - * cleared. Note that we don't have to support arbitrary moves of this flag - * because fscrypt doesn't allow no-key names to be the source or target of a - * rename(). + * When d_splice_alias() moves a directory's no-key alias to its + * plaintext alias as a result of the encryption key being added, + * DCACHE_NOKEY_NAME must be cleared and there might be an opportunity + * to disable d_revalidate. Note that we don't have to support the + * inverse operation because fscrypt doesn't allow no-key names to be + * the source or target of a rename(). */ static inline void fscrypt_handle_d_move(struct dentry *dentry) { - dentry->d_flags &= ~DCACHE_NOKEY_NAME; + /* + * VFS calls fscrypt_handle_d_move even for non-fscrypt + * filesystems. + */ + if (dentry->d_flags & DCACHE_NOKEY_NAME) { + dentry->d_flags &= ~DCACHE_NOKEY_NAME; + + /* + * Other filesystem features might be handling dentry + * revalidation, in which case it cannot be disabled. 
+ */ + if (dentry->d_op->d_revalidate == fscrypt_d_revalidate) + dentry->d_flags &= ~DCACHE_OP_REVALIDATE; + } } /** @@ -397,7 +413,6 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode, bool fscrypt_match_name(const struct fscrypt_name *fname, const u8 *de_name, u32 de_name_len); u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name); -int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags); /* bio.c */ bool fscrypt_decrypt_bio(struct bio *bio); -- cgit v1.2.3 From 70dfe3f0d239c2e8abc6a7bea24411031f85b652 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 21 Feb 2024 12:14:08 -0500 Subject: libfs: Add helper to choose dentry operations at mount-time In preparation to drop the similar helper that sets d_op at lookup time, add a version to set the right d_op filesystem-wide, through sb->s_d_op. The operations structures are shared across filesystems supporting fscrypt and/or casefolding, therefore we can keep it in common libfs code. Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20240221171412.10710-7-krisman@suse.de Signed-off-by: Gabriel Krisman Bertazi --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index a0eb8b5759a6..383c5145465f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3281,6 +3281,7 @@ extern int generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_check_addressable(unsigned, u64); extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry); +extern void generic_set_sb_d_ops(struct super_block *sb); static inline bool sb_has_encoding(const struct super_block *sb) { -- cgit v1.2.3 From 101c3fad29d7a0a90ff063b1aad586a0211911ec Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 21 Feb 2024 12:14:12 -0500 Subject: libfs: Drop generic_set_encrypted_ci_d_ops No filesystems depend on it anymore, and it is generally a bad idea. 
Since all dentries should have the same set of dentry operations in case-insensitive capable filesystems, it should be propagated through ->s_d_op. Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20240221171412.10710-11-krisman@suse.de Signed-off-by: Gabriel Krisman Bertazi --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 383c5145465f..ff1338109b54 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3280,7 +3280,6 @@ extern int generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_check_addressable(unsigned, u64); -extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry); extern void generic_set_sb_d_ops(struct super_block *sb); static inline bool sb_has_encoding(const struct super_block *sb) -- cgit v1.2.3 From 11b4eedfc87de394ed8cc54dea87c745d37ff9dc Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 19 Feb 2024 10:37:29 +0100 Subject: fbdev: Do not include <linux/backlight.h> in header Forward declare struct backlight_device and remove the include statement. Signed-off-by: Thomas Zimmermann Reviewed-by: Jani Nikula Acked-by: Helge Deller Link: https://patchwork.freedesktop.org/patch/msgid/20240219093941.3684-5-tzimmermann@suse.de --- include/linux/fb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index 2ce2f5c2fca9..7380d959c5d5 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -13,11 +13,11 @@ #include #include #include -#include #include #include +struct backlight_device; struct vm_area_struct; struct fb_info; struct device; -- cgit v1.2.3 From 183c81569ddef5b7cc9b0403a9bdf9090e54c4f2 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 19 Feb 2024 10:37:30 +0100 Subject: fbdev: Do not include <linux/fs.h> in header Forward declare struct inode and remove the include statement.
Signed-off-by: Thomas Zimmermann Reviewed-by: Jani Nikula Acked-by: Helge Deller Link: https://patchwork.freedesktop.org/patch/msgid/20240219093941.3684-6-tzimmermann@suse.de --- include/linux/fb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index 7380d959c5d5..f269ba520280 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -8,7 +8,6 @@ #define FBIO_CURSOR _IOWR('F', 0x08, struct fb_cursor_user) -#include #include #include #include @@ -22,6 +21,7 @@ struct vm_area_struct; struct fb_info; struct device; struct file; +struct inode; struct videomode; struct device_node; -- cgit v1.2.3 From 0f115335cff5caa53738e0240d6f7f0b85c72e14 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 19 Feb 2024 10:37:31 +0100 Subject: fbdev: Do not include in header Forward declare struct notifier_block and remove the include statement. Signed-off-by: Thomas Zimmermann Reviewed-by: Jani Nikula Acked-by: Helge Deller Link: https://patchwork.freedesktop.org/patch/msgid/20240219093941.3684-7-tzimmermann@suse.de --- include/linux/fb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index f269ba520280..90f348f14a49 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -10,7 +10,6 @@ #include #include -#include #include #include @@ -22,6 +21,7 @@ struct fb_info; struct device; struct file; struct inode; +struct notifier_block; struct videomode; struct device_node; -- cgit v1.2.3 From 7a46212f2a15cd8f041d2b4243ab64649c394260 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 19 Feb 2024 10:37:32 +0100 Subject: fbdev: Do not include in header Forward declare struct page and remove the include statement. 
Signed-off-by: Thomas Zimmermann Reviewed-by: Jani Nikula Acked-by: Helge Deller Link: https://patchwork.freedesktop.org/patch/msgid/20240219093941.3684-8-tzimmermann@suse.de --- include/linux/fb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index 90f348f14a49..42155898374b 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -11,7 +11,6 @@ #include #include #include -#include #include @@ -22,6 +21,7 @@ struct device; struct file; struct inode; struct notifier_block; +struct page; struct videomode; struct device_node; -- cgit v1.2.3 From f6d520783a087fad2c5196f1231876312996cf07 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 19 Feb 2024 10:37:33 +0100 Subject: fbdev: Clean up forward declarations in header file Add forward declarations for struct i2c_adapter and struct module, and sort the list alphabetically. Signed-off-by: Thomas Zimmermann Reviewed-by: Jani Nikula Acked-by: Helge Deller Link: https://patchwork.freedesktop.org/patch/msgid/20240219093941.3684-9-tzimmermann@suse.de --- include/linux/fb.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index 42155898374b..8f70ca727a30 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -15,15 +15,17 @@ #include struct backlight_device; -struct vm_area_struct; -struct fb_info; struct device; +struct device_node; +struct fb_info; struct file; +struct i2c_adapter; struct inode; +struct module; struct notifier_block; struct page; struct videomode; -struct device_node; +struct vm_area_struct; /* Definitions below are used in the parsed monitor specs */ #define FB_DPMS_ACTIVE_OFF 1 -- cgit v1.2.3 From 0c591381e4462005234f942d9fc36a369c0f5998 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 19 Feb 2024 10:37:34 +0100 Subject: fbdev: Clean up include statements in header file Include mutex.h, printk.h 
and types.h, remove several unnecessary include statements, and sort the list alphabetically. Signed-off-by: Thomas Zimmermann Reviewed-by: Jani Nikula Acked-by: Helge Deller Link: https://patchwork.freedesktop.org/patch/msgid/20240219093941.3684-10-tzimmermann@suse.de --- include/linux/fb.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index 8f70ca727a30..708e6a177b1b 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -2,15 +2,15 @@ #ifndef _LINUX_FB_H #define _LINUX_FB_H -#include -#include #include #define FBIO_CURSOR _IOWR('F', 0x08, struct fb_cursor_user) -#include +#include +#include +#include +#include #include -#include #include -- cgit v1.2.3 From 416eb60317c64676d158dffea150762930ec008f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 23 Feb 2024 10:01:46 +0100 Subject: bitfield: suppress "dubious: x & !y" sparse warning There's a somewhat common pattern of using FIELD_PREP() even for single bits, e.g. cmd->info1 |= FIELD_PREP(HTT_SRNG_SETUP_CMD_INFO1_RING_FLAGS_MSI_SWAP, !!(params.flags & HAL_SRNG_FLAGS_MSI_SWAP)); which might as well be written as if (params.flags & HAL_SRNG_FLAGS_MSI_SWAP) cmd->info1 |= HTT_SRNG_SETUP_CMD_INFO1_RING_FLAGS_MSI_SWAP; (since info1 is fully initialized to start with), but in a long chain of FIELD_PREP() this really seems fine. However, it triggers a sparse warning, in the check in the macro for whether a constant value fits into the mask, as this contains a "& (_val)". In this case, this really is always intentional, so just suppress the warning by adding "0+" to the expression, indicating explicitly that this is correct. 
Signed-off-by: Johannes Berg Signed-off-by: Kalle Valo Link: https://msgid.link/20240223100146.d243b6b1a9a1.I033828b1187c6bccf086e31400f7e933bb8373e7@changeid --- include/linux/bitfield.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h index ebfa12f69501..63928f173223 100644 --- a/include/linux/bitfield.h +++ b/include/linux/bitfield.h @@ -66,7 +66,8 @@ _pfx "mask is not constant"); \ BUILD_BUG_ON_MSG((_mask) == 0, _pfx "mask is zero"); \ BUILD_BUG_ON_MSG(__builtin_constant_p(_val) ? \ - ~((_mask) >> __bf_shf(_mask)) & (_val) : 0, \ + ~((_mask) >> __bf_shf(_mask)) & \ + (0 + (_val)) : 0, \ _pfx "value too large for the field"); \ BUILD_BUG_ON_MSG(__bf_cast_unsigned(_mask, _mask) > \ __bf_cast_unsigned(_reg, ~0ull), \ -- cgit v1.2.3 From d6cac0b6b0115fd0a5f51a49401473626e4e4fe7 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 22 Feb 2024 10:05:38 -0500 Subject: locking/mutex: Simplify CONFIG_DEBUG_MUTEXES and CONFIG_PREEMPT_RT are mutually exclusive. They can't be both set at the same time. Move up the mutex_destroy() function declaration and the __DEBUG_MUTEX_INITIALIZER() macro above the "#ifndef CONFIG_PREEMPT_RT" section to eliminate duplicated mutex_destroy() declaration. Also remove the duplicated mutex_trylock() function declaration in the CONFIG_PREEMPT_RT section. 
Signed-off-by: Waiman Long Signed-off-by: Ingo Molnar Reviewed-by: Boqun Feng Link: https://lore.kernel.org/r/20240222150540.79981-3-longman@redhat.com --- include/linux/mutex.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 7e208d46ba5b..67edc4ca2bee 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -32,11 +32,9 @@ # define __DEP_MAP_MUTEX_INITIALIZER(lockname) #endif -#ifndef CONFIG_PREEMPT_RT - #ifdef CONFIG_DEBUG_MUTEXES -#define __DEBUG_MUTEX_INITIALIZER(lockname) \ +# define __DEBUG_MUTEX_INITIALIZER(lockname) \ , .magic = &lockname extern void mutex_destroy(struct mutex *lock); @@ -49,6 +47,7 @@ static inline void mutex_destroy(struct mutex *lock) {} #endif +#ifndef CONFIG_PREEMPT_RT /** * mutex_init - initialize the mutex * @mutex: the mutex to be initialized @@ -101,9 +100,6 @@ extern bool mutex_is_locked(struct mutex *lock); extern void __mutex_rt_init(struct mutex *lock, const char *name, struct lock_class_key *key); -extern int mutex_trylock(struct mutex *lock); - -static inline void mutex_destroy(struct mutex *lock) { } #define mutex_is_locked(l) rt_mutex_base_is_locked(&(l)->rtmutex) -- cgit v1.2.3 From 292fac464b012200c4e99d08974fed3bc087b848 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 26 Feb 2024 19:29:15 -0600 Subject: net: ethtool: eee: Remove legacy _u32 from keee All MAC drivers have been converted to use the link mode members of keee. So remove the _u32 values, and the code in the ethtool core to convert the legacy _u32 values to link modes. Reviewed-by: Simon Horman Signed-off-by: Andrew Lunn Signed-off-by: David S. 
Miller --- include/linux/ethtool.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index b90c33607594..9901e563f706 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -226,9 +226,6 @@ struct ethtool_keee { __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); __ETHTOOL_DECLARE_LINK_MODE_MASK(advertised); __ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertised); - u32 supported_u32; - u32 advertised_u32; - u32 lp_advertised_u32; u32 tx_lpi_timer; bool tx_lpi_enabled; bool eee_active; -- cgit v1.2.3 From 02e765697038c596dc4a1126a13b018608365d81 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Tue, 9 Jan 2024 15:04:56 +0800 Subject: swiotlb: add debugfs to track swiotlb transient pool usage Introduce a new debugfs interface io_tlb_transient_nslabs. The device driver can create a new swiotlb transient memory pool once default memory pool is full. To export the swiotlb transient memory pool usage via debugfs would help the user estimate the size of transient swiotlb memory pool or analyze device driver memory leak issue. Signed-off-by: ZhangPeng Signed-off-by: Christoph Hellwig --- include/linux/swiotlb.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index ecde0312dd52..ea23097e351f 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -120,6 +120,8 @@ struct io_tlb_pool { * debugfs. * @used_hiwater: The high water mark for total_used. Used only for reporting * in debugfs. + * @transient_nslabs: The total number of slots in all transient pools that + * are currently used across all areas. 
*/ struct io_tlb_mem { struct io_tlb_pool defpool; @@ -137,6 +139,7 @@ struct io_tlb_mem { #ifdef CONFIG_DEBUG_FS atomic_long_t total_used; atomic_long_t used_hiwater; + atomic_long_t transient_nslabs; #endif }; -- cgit v1.2.3 From 54de442747037485da1fc4eca9636287a61e97e3 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Sat, 10 Feb 2024 19:39:23 +0800 Subject: sched/topology: Rename SD_SHARE_PKG_RESOURCES to SD_SHARE_LLC SD_SHARE_PKG_RESOURCES is a bit of a misnomer: its naming suggests that it's sharing all 'package resources' - while in reality it's specifically for sharing the LLC only. Rename it to SD_SHARE_LLC to reduce confusion. [ mingo: Rewrote the confusing changelog as well. ] Suggested-by: Valentin Schneider Signed-off-by: Alex Shi Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Reviewed-by: Ricardo Neri Reviewed-by: Barry Song Link: https://lore.kernel.org/r/20240210113924.1130448-5-alexs@kernel.org --- include/linux/sched/sd_flags.h | 4 ++-- include/linux/sched/topology.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h index a8b28647aafc..b04a5d04dee9 100644 --- a/include/linux/sched/sd_flags.h +++ b/include/linux/sched/sd_flags.h @@ -117,13 +117,13 @@ SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) SD_FLAG(SD_CLUSTER, SDF_NEEDS_GROUPS) /* - * Domain members share CPU package resources (i.e. caches) + * Domain members share CPU Last Level Caches * * SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share * the same cache(s). * NEEDS_GROUPS: Caches are shared between groups. 
*/ -SD_FLAG(SD_SHARE_PKG_RESOURCES, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) +SD_FLAG(SD_SHARE_LLC, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) /* * Only a single load balancing instance diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index a6e04b4a21d7..191b122158fb 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -38,21 +38,21 @@ extern const struct sd_flag_debug sd_flag_debug[]; #ifdef CONFIG_SCHED_SMT static inline int cpu_smt_flags(void) { - return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES; + return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC; } #endif #ifdef CONFIG_SCHED_CLUSTER static inline int cpu_cluster_flags(void) { - return SD_CLUSTER | SD_SHARE_PKG_RESOURCES; + return SD_CLUSTER | SD_SHARE_LLC; } #endif #ifdef CONFIG_SCHED_MC static inline int cpu_core_flags(void) { - return SD_SHARE_PKG_RESOURCES; + return SD_SHARE_LLC; } #endif -- cgit v1.2.3 From 3f7edeac0bbb301a07d7ea2abd28727aaa7fdab0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 12 Dec 2023 16:13:22 +0100 Subject: SUNRPC: Add a transport callback to handle dequeuing of an RPC request Add a transport level callback to allow it to handle the consequences of dequeuing the request that was in the process of being transmitted. For something like a TCP connection, we may need to disconnect if the request was partially transmitted. 
Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 464f6a9492ab..81b952649d35 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -152,6 +152,7 @@ struct rpc_xprt_ops { int (*prepare_request)(struct rpc_rqst *req, struct xdr_buf *buf); int (*send_request)(struct rpc_rqst *req); + void (*abort_send_request)(struct rpc_rqst *req); void (*wait_for_reply_request)(struct rpc_task *task); void (*timer)(struct rpc_xprt *xprt, struct rpc_task *task); void (*release_request)(struct rpc_task *task); -- cgit v1.2.3 From 2c35f43b5a4b9cdfaa6fdd946f5a212615dac8eb Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Tue, 30 Jan 2024 11:38:25 -0800 Subject: SUNRPC: increase size of rpc_wait_queue.qlen from unsigned short to unsigned int When the NFS client is under extreme load the rpc_wait_queue.qlen counter can be overflowed. Here is an instance of the backlog queue overflow in a real world environment shown by drgn helper: rpc_task_stats(rpc_clnt): ------------------------- rpc_clnt: 0xffff92b65d2bae00 rpc_xprt: 0xffff9275db64f000 Queue: sending[64887] pending[524] backlog[30441] binding[0] XMIT task: 0xffff925c6b1d8e98 WRITE: 750654 __dta_call_status_580: 65463 __dta_call_transmit_status_579: 1 call_reserveresult: 685189 nfs_client_init_is_complete: 1 COMMIT: 584 call_reserveresult: 573 __dta_call_status_580: 11 ACCESS: 1 __dta_call_status_580: 1 GETATTR: 10 __dta_call_status_580: 4 call_reserveresult: 6 751249 tasks for server 111.222.333.444 Total tasks: 751249 count_rpc_wait_queues(xprt): ---------------------------- **** rpc_xprt: 0xffff9275db64f000 num_reqs: 65511 wait_queue: xprt_binding[0] cnt: 0 wait_queue: xprt_binding[1] cnt: 0 wait_queue: xprt_binding[2] cnt: 0 wait_queue: xprt_binding[3] cnt: 0 rpc_wait_queue[xprt_binding].qlen: 0 maxpriority: 0 wait_queue: xprt_sending[0] cnt: 0 wait_queue: 
xprt_sending[1] cnt: 64887 wait_queue: xprt_sending[2] cnt: 0 wait_queue: xprt_sending[3] cnt: 0 rpc_wait_queue[xprt_sending].qlen: 64887 maxpriority: 3 wait_queue: xprt_pending[0] cnt: 524 wait_queue: xprt_pending[1] cnt: 0 wait_queue: xprt_pending[2] cnt: 0 wait_queue: xprt_pending[3] cnt: 0 rpc_wait_queue[xprt_pending].qlen: 524 maxpriority: 0 wait_queue: xprt_backlog[0] cnt: 0 wait_queue: xprt_backlog[1] cnt: 685801 wait_queue: xprt_backlog[2] cnt: 0 wait_queue: xprt_backlog[3] cnt: 0 rpc_wait_queue[xprt_backlog].qlen: 30441 maxpriority: 3 [task cnt mismatch] There is no effect on operations when this overflow occurs. However it causes confusion when trying to diagnose the performance problem. Signed-off-by: Dai Ngo Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust --- include/linux/sunrpc/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 2d61987b3545..0c77ba488bba 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -197,7 +197,7 @@ struct rpc_wait_queue { unsigned char maxpriority; /* maximum priority (0 if queue is not a priority queue) */ unsigned char priority; /* current priority */ unsigned char nr; /* # tasks remaining for cookie */ - unsigned short qlen; /* total # tasks waiting in queue */ + unsigned int qlen; /* total # tasks waiting in queue */ struct rpc_timer timer_list; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || IS_ENABLED(CONFIG_TRACEPOINTS) const char * name; -- cgit v1.2.3 From 62e7151ae3eb465e0ab52a20c941ff33bb6332e9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 27 Feb 2024 16:17:51 +0100 Subject: netfilter: bridge: confirm multicast packets before passing them up the stack conntrack nf_confirm logic cannot handle cloned skbs referencing the same nf_conn entry, which will happen for multicast (broadcast) frames on bridges. 
Example: macvlan0 | br0 / \ ethX ethY ethX (or Y) receives an L2 multicast or broadcast packet containing an IP packet, flow is not yet in conntrack table. 1. skb passes through bridge and fake-ip (br_netfilter)Prerouting. -> skb->_nfct now references an unconfirmed entry 2. skb is broad/mcast packet. bridge now passes clones out on each bridge interface. 3. skb gets passed up the stack. 4. In macvlan case, macvlan driver retains clone(s) of the mcast skb and schedules a work queue to send them out on the lower devices. The clone skb->_nfct is not a copy, it is the same entry as the original skb. The macvlan rx handler then returns RX_HANDLER_PASS. 5. Normal conntrack hooks (in NF_INET_LOCAL_IN) confirm the orig skb. The Macvlan broadcast worker and normal confirm path will race. This race will not happen if step 2 already confirmed a clone. In that case later steps perform skb_clone() with skb->_nfct already confirmed (in hash table). This works fine. But such confirmation won't happen when eb/ip/nftables rules dropped the packets before they reached the nf_confirm step in postrouting. Pablo points out that nf_conntrack_bridge doesn't allow use of stateful nat, so we can safely discard the nf_conn entry and let inet call conntrack again. This doesn't work for bridge netfilter: skb could have a nat transformation. Also bridge nf prevents re-invocation of inet prerouting via 'sabotage_in' hook. Work around this problem by explicit confirmation of the entry at LOCAL_IN time, before upper layer has a chance to clone the unconfirmed entry. The downside is that this disables NAT and conntrack helpers. 
Alternative fix would be to add locking to all code parts that deal with unconfirmed packets, but even if that could be done in a sane way this opens up other problems, for example: -m physdev --physdev-out eth0 -j SNAT --snat-to 1.2.3.4 -m physdev --physdev-out eth1 -j SNAT --snat-to 1.2.3.5 For multicast case, only one of such conflicting mappings will be created, conntrack only handles 1:1 NAT mappings. Users should create a setup that explicitly marks such traffic NOTRACK (conntrack bypass) to avoid this, but we cannot auto-bypass them, ruleset might have accept rules for untracked traffic already, so user-visible behaviour would change. Suggested-by: Pablo Neira Ayuso Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217777 Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 80900d910992..ce660d51549b 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -474,6 +474,7 @@ struct nf_ct_hook { const struct sk_buff *); void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb); void (*set_closing)(struct nf_conntrack *nfct); + int (*confirm)(struct sk_buff *skb); }; extern const struct nf_ct_hook __rcu *nf_ct_hook; -- cgit v1.2.3 From b0cde62e4c548b2e7cb535caa6eb0df135888601 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 4 Jan 2024 23:55:11 +0100 Subject: clk: Add a devm variant of clk_rate_exclusive_get() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows to simplify drivers that use clk_rate_exclusive_get() in their probe routine as calling clk_rate_exclusive_put() is cared for automatically. 
Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20240104225512.1124519-2-u.kleine-koenig@pengutronix.de Acked-by: Russell King (Oracle) Signed-off-by: Stephen Boyd --- include/linux/clk.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk.h b/include/linux/clk.h index 06f1b292f8a0..24c49b01c25d 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -201,6 +201,18 @@ bool clk_is_match(const struct clk *p, const struct clk *q); */ int clk_rate_exclusive_get(struct clk *clk); +/** + * devm_clk_rate_exclusive_get - devm variant of clk_rate_exclusive_get + * @dev: device the exclusivity is bound to + * @clk: clock source + * + * Calls clk_rate_exclusive_get() on @clk and registers a devm cleanup handler + * on @dev to call clk_rate_exclusive_put(). + * + * Must not be called from within atomic context. + */ +int devm_clk_rate_exclusive_get(struct device *dev, struct clk *clk); + /** * clk_rate_exclusive_put - release exclusivity over the rate control of a * producer -- cgit v1.2.3 From 0598f8f3bb77893a13105d47bb7dfe42f1dc1f4e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Feb 2024 09:24:09 +0000 Subject: inet: annotate devconf data-races Add READ_ONCE() in ipv4_devconf_get() and corresponding WRITE_ONCE() in ipv4_devconf_set() Add IPV4_DEVCONF_RO() and IPV4_DEVCONF_ALL_RO() macros, and use them when reading devconf fields. 
Signed-off-by: Eric Dumazet Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20240227092411.2315725-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/inetdevice.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index ddb27fc0ee8c..cb5280e6cc21 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -53,13 +53,15 @@ struct in_device { }; #define IPV4_DEVCONF(cnf, attr) ((cnf).data[IPV4_DEVCONF_ ## attr - 1]) +#define IPV4_DEVCONF_RO(cnf, attr) READ_ONCE(IPV4_DEVCONF(cnf, attr)) #define IPV4_DEVCONF_ALL(net, attr) \ IPV4_DEVCONF((*(net)->ipv4.devconf_all), attr) +#define IPV4_DEVCONF_ALL_RO(net, attr) READ_ONCE(IPV4_DEVCONF_ALL(net, attr)) -static inline int ipv4_devconf_get(struct in_device *in_dev, int index) +static inline int ipv4_devconf_get(const struct in_device *in_dev, int index) { index--; - return in_dev->cnf.data[index]; + return READ_ONCE(in_dev->cnf.data[index]); } static inline void ipv4_devconf_set(struct in_device *in_dev, int index, @@ -67,7 +69,7 @@ static inline void ipv4_devconf_set(struct in_device *in_dev, int index, { index--; set_bit(index, in_dev->cnf.state); - in_dev->cnf.data[index] = val; + WRITE_ONCE(in_dev->cnf.data[index], val); } static inline void ipv4_devconf_setall(struct in_device *in_dev) @@ -81,18 +83,18 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev) ipv4_devconf_set((in_dev), IPV4_DEVCONF_ ## attr, (val)) #define IN_DEV_ANDCONF(in_dev, attr) \ - (IPV4_DEVCONF_ALL(dev_net(in_dev->dev), attr) && \ + (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), attr) && \ IN_DEV_CONF_GET((in_dev), attr)) #define IN_DEV_NET_ORCONF(in_dev, net, attr) \ - (IPV4_DEVCONF_ALL(net, attr) || \ + (IPV4_DEVCONF_ALL_RO(net, attr) || \ IN_DEV_CONF_GET((in_dev), attr)) #define IN_DEV_ORCONF(in_dev, attr) \ IN_DEV_NET_ORCONF(in_dev, dev_net(in_dev->dev), attr) #define 
IN_DEV_MAXCONF(in_dev, attr) \ - (max(IPV4_DEVCONF_ALL(dev_net(in_dev->dev), attr), \ + (max(IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), attr), \ IN_DEV_CONF_GET((in_dev), attr))) #define IN_DEV_FORWARD(in_dev) IN_DEV_CONF_GET((in_dev), FORWARDING) -- cgit v1.2.3 From 99123622050f10ca9148a0fffba2de0afd6cdfff Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Feb 2024 19:27:21 +0000 Subject: tcp: remove some holes in struct tcp_sock By moving some fields around, this patch shrinks holes size from 56 to 32, saving 24 bytes on 64bit arches. After the patch pahole gives the following for 'struct tcp_sock': /* size: 2304, cachelines: 36, members: 162 */ /* sum members: 2234, holes: 6, sum holes: 32 */ /* sum bitfield members: 34 bits, bit holes: 5, sum bit holes: 14 bits */ /* padding: 32 */ /* paddings: 3, sum paddings: 10 */ /* forced alignments: 1, forced holes: 1, sum forced holes: 12 */ Signed-off-by: Eric Dumazet Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20240227192721.3558982-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index a1c47a6d69b0..988a30ef6bfe 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -264,10 +264,10 @@ struct tcp_sock { u32 pushed_seq; /* Last pushed seq, required to talk to windows */ u32 lsndtime; u32 mdev_us; /* medium deviation */ + u32 rtt_seq; /* sequence number to update rttvar */ u64 tcp_wstamp_ns; /* departure time for next sent data packet */ u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ - u32 rtt_seq; /* sequence number to update rttvar */ struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */ struct sk_buff *highest_sack; /* skb just after the highest * skb with SACKed bit set @@ -350,7 +350,6 @@ struct tcp_sock { u32 
dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups * total number of DSACK blocks received */ - u32 last_oow_ack_time; /* timestamp of last out-of-window ACK */ u32 compressed_ack_rcv_nxt; struct list_head tsq_node; /* anchor in tsq_tasklet.head list */ @@ -384,12 +383,12 @@ struct tcp_sock { syn_fastopen_ch:1, /* Active TFO re-enabling probe */ syn_data_acked:1;/* data in SYN is acked by SYN-ACK */ + u8 keepalive_probes; /* num of allowed keep alive probes */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ /* RTT measurement */ u32 mdev_max_us; /* maximal mdev for the last rtt period */ - u8 keepalive_probes; /* num of allowed keep alive probes */ u32 reord_seen; /* number of data packet reordering events */ /* @@ -402,6 +401,7 @@ struct tcp_sock { u32 prior_cwnd; /* cwnd right before starting loss recovery */ u32 prr_delivered; /* Number of newly delivered packets to * receiver in Recovery. */ + u32 last_oow_ack_time; /* timestamp of last out-of-window ACK */ struct hrtimer pacing_timer; struct hrtimer compressed_ack_timer; @@ -477,8 +477,8 @@ struct tcp_sock { bool is_mptcp; #endif #if IS_ENABLED(CONFIG_SMC) - bool (*smc_hs_congested)(const struct sock *sk); bool syn_smc; /* SYN includes SMC */ + bool (*smc_hs_congested)(const struct sock *sk); #endif #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) -- cgit v1.2.3 From 966942ae493650210b9514f3d4bfc95f78ef0129 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Lebrun?= Date: Wed, 28 Feb 2024 12:28:04 +0100 Subject: gpio: nomadik: extract GPIO platform driver from drivers/pinctrl/nomadik/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, drivers/pinctrl/nomadik/pinctrl-nomadik.c registered two platform drivers: pinctrl & GPIO. Move the GPIO aspect to the drivers/gpio/ folder, as would be expected. 
Both drivers are intertwined for a reason; pinctrl requires access to GPIO registers for pinmuxing, pull-disable, disabling interrupts while setting the muxing and wakeup control. Information sharing is done through a shared array containing GPIO chips and a few helper functions. That shared array is not touched from gpio-nomadik when CONFIG_PINCTRL_NOMADIK is not defined. Make no change to the code that moved into gpio-nomadik; there should be no behavior change following. A few functions are shared and header comments are added. Checkpatch warnings are addressed. NUM_BANKS is renamed to NMK_MAX_BANKS. It is supported to compile gpio-nomadik without pinctrl-nomadik. The opposite is not true. Signed-off-by: Théo Lebrun Link: https://lore.kernel.org/r/20240228-mbly-gpio-v2-6-3ba757474006@bootlin.com Signed-off-by: Linus Walleij --- include/linux/gpio/gpio-nomadik.h | 276 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 276 insertions(+) create mode 100644 include/linux/gpio/gpio-nomadik.h (limited to 'include/linux') diff --git a/include/linux/gpio/gpio-nomadik.h b/include/linux/gpio/gpio-nomadik.h new file mode 100644 index 000000000000..0166ddb71f43 --- /dev/null +++ b/include/linux/gpio/gpio-nomadik.h @@ -0,0 +1,276 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_GPIO_NOMADIK_H +#define __LINUX_GPIO_NOMADIK_H + +/* Package definitions */ +#define PINCTRL_NMK_STN8815 0 +#define PINCTRL_NMK_DB8500 1 + +#define GPIO_BLOCK_SHIFT 5 +#define NMK_GPIO_PER_CHIP BIT(GPIO_BLOCK_SHIFT) +#define NMK_MAX_BANKS DIV_ROUND_UP(512, NMK_GPIO_PER_CHIP) + +/* Register in the logic block */ +#define NMK_GPIO_DAT 0x00 +#define NMK_GPIO_DATS 0x04 +#define NMK_GPIO_DATC 0x08 +#define NMK_GPIO_PDIS 0x0c +#define NMK_GPIO_DIR 0x10 +#define NMK_GPIO_DIRS 0x14 +#define NMK_GPIO_DIRC 0x18 +#define NMK_GPIO_SLPC 0x1c +#define NMK_GPIO_AFSLA 0x20 +#define NMK_GPIO_AFSLB 0x24 +#define NMK_GPIO_LOWEMI 0x28 + +#define NMK_GPIO_RIMSC 0x40 +#define NMK_GPIO_FIMSC 0x44 +#define 
NMK_GPIO_IS 0x48 +#define NMK_GPIO_IC 0x4c +#define NMK_GPIO_RWIMSC 0x50 +#define NMK_GPIO_FWIMSC 0x54 +#define NMK_GPIO_WKS 0x58 +/* These appear in DB8540 and later ASICs */ +#define NMK_GPIO_EDGELEVEL 0x5C +#define NMK_GPIO_LEVEL 0x60 + +/* Pull up/down values */ +enum nmk_gpio_pull { + NMK_GPIO_PULL_NONE, + NMK_GPIO_PULL_UP, + NMK_GPIO_PULL_DOWN, +}; + +/* Sleep mode */ +enum nmk_gpio_slpm { + NMK_GPIO_SLPM_INPUT, + NMK_GPIO_SLPM_WAKEUP_ENABLE = NMK_GPIO_SLPM_INPUT, + NMK_GPIO_SLPM_NOCHANGE, + NMK_GPIO_SLPM_WAKEUP_DISABLE = NMK_GPIO_SLPM_NOCHANGE, +}; + +struct nmk_gpio_chip { + struct gpio_chip chip; + void __iomem *addr; + struct clk *clk; + unsigned int bank; + void (*set_ioforce)(bool enable); + spinlock_t lock; + bool sleepmode; + /* Keep track of configured edges */ + u32 edge_rising; + u32 edge_falling; + u32 real_wake; + u32 rwimsc; + u32 fwimsc; + u32 rimsc; + u32 fimsc; + u32 pull_up; + u32 lowemi; +}; + +/* Alternate functions: function C is set in hw by setting both A and B */ +#define NMK_GPIO_ALT_GPIO 0 +#define NMK_GPIO_ALT_A 1 +#define NMK_GPIO_ALT_B 2 +#define NMK_GPIO_ALT_C (NMK_GPIO_ALT_A | NMK_GPIO_ALT_B) + +#define NMK_GPIO_ALT_CX_SHIFT 2 +#define NMK_GPIO_ALT_C1 ((1< Date: Wed, 28 Feb 2024 12:28:22 +0100 Subject: gpio: nomadik: support mobileye,eyeq5-gpio MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We create a custom compatible for the STA2X11 IP block as integrated into the Mobileye EyeQ5 platform. Its wake and alternate functions have been disabled, we want to avoid touching those registers. We both do: (1) early return in functions that do not support the platform, but with warnings, and (2) avoid calling those functions in the first place. We ensure that pinctrl-nomadik is not used with this STA2X11 variant. 
Reviewed-by: Linus Walleij Signed-off-by: Théo Lebrun Link: https://lore.kernel.org/r/20240228-mbly-gpio-v2-24-3ba757474006@bootlin.com Signed-off-by: Linus Walleij --- include/linux/gpio/gpio-nomadik.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/gpio/gpio-nomadik.h b/include/linux/gpio/gpio-nomadik.h index 0166ddb71f43..9bdb09fda4c9 100644 --- a/include/linux/gpio/gpio-nomadik.h +++ b/include/linux/gpio/gpio-nomadik.h @@ -57,6 +57,7 @@ struct nmk_gpio_chip { void (*set_ioforce)(bool enable); spinlock_t lock; bool sleepmode; + bool is_mobileye_soc; /* Keep track of configured edges */ u32 edge_rising; u32 edge_falling; -- cgit v1.2.3 From f9e28904e6442019043a8e94ec6747a064d06003 Mon Sep 17 00:00:00 2001 From: Zhiguo Niu Date: Tue, 20 Feb 2024 14:11:24 +0800 Subject: f2fs: stop checkpoint when get a out-of-bounds segment There is low probability that an out-of-bounds segment will be got on a small-capacity device. In order to prevent subsequent write requests allocating block address from this invalid segment, which may cause unexpected issue, stop checkpoint should be performed. Also introduce a new stop cp reason: STOP_CP_REASON_NO_SEGMENT. Note, f2fs_stop_checkpoint(, false) is complex and it may sleep, so we should move it outside segmap_lock spinlock coverage in get_new_segment(). 
Signed-off-by: Zhiguo Niu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 9b69c50255b2..755e9a41b196 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -75,6 +75,7 @@ enum stop_cp_reason { STOP_CP_REASON_CORRUPTED_SUMMARY, STOP_CP_REASON_UPDATE_INODE, STOP_CP_REASON_FLUSH_FAIL, + STOP_CP_REASON_NO_SEGMENT, STOP_CP_REASON_MAX, }; -- cgit v1.2.3 From d394abcb12bb1a6f309c1221fdb8e73594ecf1b4 Mon Sep 17 00:00:00 2001 From: Shivnandan Kumar Date: Tue, 27 Feb 2024 14:43:51 +0530 Subject: cpufreq: Limit resolving a frequency to policy min/max Resolving a frequency to an efficient one should not transgress policy->max (which can be set for thermal reasons) and policy->min. Currently, there is a possibility that scaling_cur_freq can exceed scaling_max_freq when scaling_max_freq is an inefficient frequency. Add a check to ensure that resolving a frequency will respect policy->min/max. Cc: All applicable Fixes: 1f39fa0dccff ("cpufreq: Introducing CPUFREQ_RELATION_E") Signed-off-by: Shivnandan Kumar [ rjw: Whitespace adjustment, changelog edits ] Signed-off-by: Rafael J.
Wysocki --- include/linux/cpufreq.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 85908b3a2f24..692ea6e55129 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -1020,6 +1020,18 @@ static inline int cpufreq_table_find_index_c(struct cpufreq_policy *policy, efficiencies); } +static inline bool cpufreq_is_in_limits(struct cpufreq_policy *policy, int idx) +{ + unsigned int freq; + + if (idx < 0) + return false; + + freq = policy->freq_table[idx].frequency; + + return freq == clamp_val(freq, policy->min, policy->max); +} + static inline int cpufreq_frequency_table_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) @@ -1053,7 +1065,8 @@ retry: return 0; } - if (idx < 0 && efficiencies) { + /* Limit frequency index to honor policy->min/max */ + if (!cpufreq_is_in_limits(policy, idx) && efficiencies) { efficiencies = false; goto retry; } -- cgit v1.2.3 From 640f41ed33b5a420e05daf395afae85e6b20c003 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 29 Feb 2024 11:05:15 -0800 Subject: dpll: fix build failure due to rcu_dereference_check() on unknown type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tasmiya reports that their compiler complains that we deref a pointer to unknown type with rcu_dereference_rtnl(): include/linux/rcupdate.h:439:9: error: dereferencing pointer to incomplete type ‘struct dpll_pin’ Unclear what compiler it is, at the moment, and we can't report but since DPLL can't be a module - move the code from the header into the source file. 
Fixes: 0d60d8df6f49 ("dpll: rely on rcu for netdev_dpll_pin()") Reported-by: Tasmiya Nalatwad Link: https://lore.kernel.org/all/3fcf3a2c-1c1b-42c1-bacb-78fdcd700389@linux.vnet.ibm.com/ Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20240229190515.2740221-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/dpll.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dpll.h b/include/linux/dpll.h index 4ec2fe9caf5a..c60591308ae8 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -169,13 +169,13 @@ int dpll_device_change_ntf(struct dpll_device *dpll); int dpll_pin_change_ntf(struct dpll_pin *pin); +#if !IS_ENABLED(CONFIG_DPLL) static inline struct dpll_pin *netdev_dpll_pin(const struct net_device *dev) { -#if IS_ENABLED(CONFIG_DPLL) - return rcu_dereference_rtnl(dev->dpll_pin); -#else return NULL; -#endif } +#else +struct dpll_pin *netdev_dpll_pin(const struct net_device *dev); +#endif #endif -- cgit v1.2.3 From 61a182ab61a6dbb8a8782d52347168dc2aa0aa0a Mon Sep 17 00:00:00 2001 From: Xiongwei Song Date: Thu, 29 Feb 2024 22:20:06 +0800 Subject: cgroup/cpuset: Remove cpuset_do_slab_mem_spread() The SLAB allocator has been removed since 6.8-rc1 [1], so there is no user of SLAB_MEM_SPREAD and cpuset_do_slab_mem_spread(). Then SLAB_MEM_SPREAD is marked as unused by [2]. Here we can remove cpuset_do_slab_mem_spread(). For more details, please check [3].
[1] https://lore.kernel.org/linux-mm/20231120-slab-remove-slab-v2-0-9c9c70177183@suse.cz/ [2] https://lore.kernel.org/linux-kernel/20240223-slab-cleanup-flags-v2-0-02f1753e8303@suse.cz/T/ [3] https://lore.kernel.org/lkml/32bc1403-49da-445a-8c00-9686a3b0d6a3@redhat.com/T/#mf14b838c5e0e77f4756d436bac3d8c0447ea4350 Signed-off-by: Xiongwei Song Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/cpuset.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 875d12598bd2..0ce6ff0d9c9a 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -121,11 +121,6 @@ static inline int cpuset_do_page_mem_spread(void) return task_spread_page(current); } -static inline int cpuset_do_slab_mem_spread(void) -{ - return task_spread_slab(current); -} - extern bool current_cpuset_is_being_rebound(void); extern void rebuild_sched_domains(void); @@ -264,11 +259,6 @@ static inline int cpuset_do_page_mem_spread(void) return 0; } -static inline int cpuset_do_slab_mem_spread(void) -{ - return 0; -} - static inline bool current_cpuset_is_being_rebound(void) { return false; -- cgit v1.2.3 From adeb04362d74188c1e22ccb824b15a0a7b3de2f4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 14 Feb 2024 19:26:32 +0200 Subject: kernel.h: Move upper_*_bits() and lower_*_bits() to wordpart.h The wordpart.h header is collecting APIs related to the handling parts of the word (usually in byte granularity). The upper_*_bits() and lower_*_bits() are good candidates to be moved to there. This helps to clean up header dependency hell with regard to kernel.h as the latter gathers completely unrelated stuff together and slows down compilation (especially when it's included into other header). 
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240214172752.3605073-1-andriy.shevchenko@linux.intel.com Reviewed-by: Randy Dunlap Signed-off-by: Kees Cook --- include/linux/kernel.h | 30 ++---------------------------- include/linux/wordpart.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index f4a1d582b79d..86dd8939c2cd 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -33,6 +33,8 @@ #include #include #include +#include + #include #include @@ -52,34 +54,6 @@ } \ ) -/** - * upper_32_bits - return bits 32-63 of a number - * @n: the number we're accessing - * - * A basic shift-right of a 64- or 32-bit quantity. Use this to suppress - * the "right shift count >= width of type" warning when that quantity is - * 32-bits. - */ -#define upper_32_bits(n) ((u32)(((n) >> 16) >> 16)) - -/** - * lower_32_bits - return bits 0-31 of a number - * @n: the number we're accessing - */ -#define lower_32_bits(n) ((u32)((n) & 0xffffffff)) - -/** - * upper_16_bits - return bits 16-31 of a number - * @n: the number we're accessing - */ -#define upper_16_bits(n) ((u16)((n) >> 16)) - -/** - * lower_16_bits - return bits 0-15 of a number - * @n: the number we're accessing - */ -#define lower_16_bits(n) ((u16)((n) & 0xffff)) - struct completion; struct user; diff --git a/include/linux/wordpart.h b/include/linux/wordpart.h index c9e6bd773ebd..f6f8f83b15b0 100644 --- a/include/linux/wordpart.h +++ b/include/linux/wordpart.h @@ -2,6 +2,35 @@ #ifndef _LINUX_WORDPART_H #define _LINUX_WORDPART_H + +/** + * upper_32_bits - return bits 32-63 of a number + * @n: the number we're accessing + * + * A basic shift-right of a 64- or 32-bit quantity. Use this to suppress + * the "right shift count >= width of type" warning when that quantity is + * 32-bits. 
+ */ +#define upper_32_bits(n) ((u32)(((n) >> 16) >> 16)) + +/** + * lower_32_bits - return bits 0-31 of a number + * @n: the number we're accessing + */ +#define lower_32_bits(n) ((u32)((n) & 0xffffffff)) + +/** + * upper_16_bits - return bits 16-31 of a number + * @n: the number we're accessing + */ +#define upper_16_bits(n) ((u16)((n) >> 16)) + +/** + * lower_16_bits - return bits 0-15 of a number + * @n: the number we're accessing + */ +#define lower_16_bits(n) ((u16)((n) & 0xffff)) + /** * REPEAT_BYTE - repeat the value @x multiple times as an unsigned long value * @x: value to repeat -- cgit v1.2.3 From 3e19086fb5a9079611de426e8cb2f4503e28757e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 29 Jan 2024 10:21:58 -0800 Subject: overflow: Adjust check_*_overflow() kern-doc to reflect results The check_*_overflow() helpers will return results with potentially wrapped-around values. These values have always been checked by the selftests, so avoid the confusing language in the kern-doc. The idea of "safe for use" was relative to the expectation of whether or not the caller wants a wrapped value -- the calculation itself will always follow arithmetic wrapping rules. Reviewed-by: Gustavo A. R. Silva Acked-by: Mark Rutland Signed-off-by: Kees Cook --- include/linux/overflow.h | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/overflow.h b/include/linux/overflow.h index 7b5cf4a5cd19..ad64d810c8aa 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -57,11 +57,10 @@ static inline bool __must_check __must_check_overflow(bool overflow) * @b: second addend * @d: pointer to store sum * - * Returns 0 on success. + * Returns true on wrap-around, false otherwise. * - * *@d holds the results of the attempted addition, but is not considered - * "safe for use" on a non-zero return value, which indicates that the - * sum has overflowed or been truncated. 
+ * *@d holds the results of the attempted addition, regardless of whether + * wrap-around occurred. */ #define check_add_overflow(a, b, d) \ __must_check_overflow(__builtin_add_overflow(a, b, d)) @@ -72,11 +71,10 @@ static inline bool __must_check __must_check_overflow(bool overflow) * @b: subtrahend; value to subtract from @a * @d: pointer to store difference * - * Returns 0 on success. + * Returns true on wrap-around, false otherwise. * - * *@d holds the results of the attempted subtraction, but is not considered - * "safe for use" on a non-zero return value, which indicates that the - * difference has underflowed or been truncated. + * *@d holds the results of the attempted subtraction, regardless of whether + * wrap-around occurred. */ #define check_sub_overflow(a, b, d) \ __must_check_overflow(__builtin_sub_overflow(a, b, d)) @@ -87,11 +85,10 @@ static inline bool __must_check __must_check_overflow(bool overflow) * @b: second factor * @d: pointer to store product * - * Returns 0 on success. + * Returns true on wrap-around, false otherwise. * - * *@d holds the results of the attempted multiplication, but is not - * considered "safe for use" on a non-zero return value, which indicates - * that the product has overflowed or been truncated. + * *@d holds the results of the attempted multiplication, regardless of whether + * wrap-around occurred. */ #define check_mul_overflow(a, b, d) \ __must_check_overflow(__builtin_mul_overflow(a, b, d)) -- cgit v1.2.3 From d70de8054c58d7bd9a4654c9f4797d29fa46d545 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 18 Jan 2024 16:05:52 -0800 Subject: overflow: Introduce wrapping_add(), wrapping_sub(), and wrapping_mul() Provide helpers that will perform wrapping addition, subtraction, or multiplication without tripping the arithmetic wrap-around sanitizers. The first argument is the type in which the wrap-around should happen.
In other words, these two calls will get very different results: wrapping_mul(int, 50, 50) == 2500 wrapping_mul(u8, 50, 50) == 196 Add to the selftests to validate behavior and lack of side-effects. Reviewed-by: Gustavo A. R. Silva Reviewed-by: Marco Elver Acked-by: Mark Rutland Signed-off-by: Kees Cook --- include/linux/overflow.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'include/linux') diff --git a/include/linux/overflow.h b/include/linux/overflow.h index ad64d810c8aa..d3ff8e2bec29 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -65,6 +65,22 @@ static inline bool __must_check __must_check_overflow(bool overflow) #define check_add_overflow(a, b, d) \ __must_check_overflow(__builtin_add_overflow(a, b, d)) +/** + * wrapping_add() - Intentionally perform a wrapping addition + * @type: type for result of calculation + * @a: first addend + * @b: second addend + * + * Return the potentially wrapped-around addition without + * tripping any wrap-around sanitizers that may be enabled. + */ +#define wrapping_add(type, a, b) \ + ({ \ + type __val; \ + __builtin_add_overflow(a, b, &__val); \ + __val; \ + }) + /** * check_sub_overflow() - Calculate subtraction with overflow checking * @a: minuend; value to subtract from @@ -79,6 +95,22 @@ static inline bool __must_check __must_check_overflow(bool overflow) #define check_sub_overflow(a, b, d) \ __must_check_overflow(__builtin_sub_overflow(a, b, d)) +/** + * wrapping_sub() - Intentionally perform a wrapping subtraction + * @type: type for result of calculation + * @a: minuend; value to subtract from + * @b: subtrahend; value to subtract from @a + * + * Return the potentially wrapped-around subtraction without + * tripping any wrap-around sanitizers that may be enabled. 
+ */ +#define wrapping_sub(type, a, b) \ + ({ \ + type __val; \ + __builtin_sub_overflow(a, b, &__val); \ + __val; \ + }) + /** * check_mul_overflow() - Calculate multiplication with overflow checking * @a: first factor @@ -93,6 +125,22 @@ static inline bool __must_check __must_check_overflow(bool overflow) #define check_mul_overflow(a, b, d) \ __must_check_overflow(__builtin_mul_overflow(a, b, d)) +/** + * wrapping_mul() - Intentionally perform a wrapping multiplication + * @type: type for result of calculation + * @a: first factor + * @b: second factor + * + * Return the potentially wrapped-around multiplication without + * tripping any wrap-around sanitizers that may be enabled. + */ +#define wrapping_mul(type, a, b) \ + ({ \ + type __val; \ + __builtin_mul_overflow(a, b, &__val); \ + __val; \ + }) + /** * check_shl_overflow() - Calculate a left-shifted value and check overflow * @a: Value to be shifted -- cgit v1.2.3 From 08d45ee84bb2650e237e150caca87cc4ded9b3e2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 26 Jan 2024 22:09:50 -0800 Subject: overflow: Introduce wrapping_assign_add() and wrapping_assign_sub() This allows replacements of the idioms "var += offset" and "var -= offset" with the wrapping_assign_add() and wrapping_assign_sub() helpers respectively. They will avoid wrap-around sanitizer instrumentation. Add to the selftests to validate behavior and lack of side-effects. 
Reviewed-by: Marco Elver Acked-by: Mark Rutland Signed-off-by: Kees Cook --- include/linux/overflow.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/overflow.h b/include/linux/overflow.h index d3ff8e2bec29..dede374832c9 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -81,6 +81,22 @@ static inline bool __must_check __must_check_overflow(bool overflow) __val; \ }) +/** + * wrapping_assign_add() - Intentionally perform a wrapping increment assignment + * @var: variable to be incremented + * @offset: amount to add + * + * Increments @var by @offset with wrap-around. Returns the resulting + * value of @var. Will not trip any wrap-around sanitizers. + * + * Returns the new value of @var. + */ +#define wrapping_assign_add(var, offset) \ + ({ \ + typeof(var) *__ptr = &(var); \ + *__ptr = wrapping_add(typeof(var), *__ptr, offset); \ + }) + /** * check_sub_overflow() - Calculate subtraction with overflow checking * @a: minuend; value to subtract from @@ -111,6 +127,22 @@ static inline bool __must_check __must_check_overflow(bool overflow) __val; \ }) +/** + * wrapping_assign_sub() - Intentionally perform a wrapping decrement assign + * @var: variable to be decremented + * @offset: amount to subtract + * + * Decrements @var by @offset with wrap-around. Returns the resulting + * value of @var. Will not trip any wrap-around sanitizers. + * + * Returns the new value of @var. 
+ */ +#define wrapping_assign_sub(var, offset) \ + ({ \ + typeof(var) *__ptr = &(var); \ + *__ptr = wrapping_sub(typeof(var), *__ptr, offset); \ + }) + /** * check_mul_overflow() - Calculate multiplication with overflow checking * @a: first factor -- cgit v1.2.3 From 9ca5facd0400f610f3f7f71aeb7fc0b949a48c67 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Wed, 14 Feb 2024 17:50:15 +0100 Subject: lib/string_choices: Add str_plural() helper Add str_plural() helper to replace existing open implementations used by many drivers and help improve future user facing messages. Signed-off-by: Michal Wajdeczko Link: https://lore.kernel.org/r/20240214165015.1656-1-michal.wajdeczko@intel.com Signed-off-by: Kees Cook --- include/linux/string_choices.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h index 3c1091941eb8..d9ebe20229f8 100644 --- a/include/linux/string_choices.h +++ b/include/linux/string_choices.h @@ -42,4 +42,15 @@ static inline const char *str_yes_no(bool v) return v ? "yes" : "no"; } +/** + * str_plural - Return the simple pluralization based on English counts + * @num: Number used for deciding pluralization + * + * If @num is 1, returns empty string, otherwise returns "s". + */ +static inline const char *str_plural(size_t num) +{ + return num == 1 ? "" : "s"; +} + #endif -- cgit v1.2.3 From 99db710f768e988e70f1164537bf533a017be24d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 20 Feb 2024 21:16:38 -0800 Subject: refcount: Annotated intentional signed integer wrap-around Mark the various refcount_t functions with __signed_wrap, as we depend on the wrapping behavior to detect the overflow and perform saturation. 
Silences warnings seen with the LKDTM REFCOUNT_* tests: UBSAN: signed-integer-overflow in ../include/linux/refcount.h:189:11 2147483647 + 1 cannot be represented in type 'int' Reviewed-by: Miguel Ojeda Link: https://lore.kernel.org/r/20240221051634.work.287-kees@kernel.org Signed-off-by: Kees Cook --- include/linux/refcount.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 85c6df0d1bef..59b3b752394d 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -136,7 +136,8 @@ static inline unsigned int refcount_read(const refcount_t *r) return atomic_read(&r->refs); } -static inline __must_check bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp) +static inline __must_check __signed_wrap +bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp) { int old = refcount_read(r); @@ -177,7 +178,8 @@ static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) return __refcount_add_not_zero(i, r, NULL); } -static inline void __refcount_add(int i, refcount_t *r, int *oldp) +static inline __signed_wrap +void __refcount_add(int i, refcount_t *r, int *oldp) { int old = atomic_fetch_add_relaxed(i, &r->refs); @@ -256,7 +258,8 @@ static inline void refcount_inc(refcount_t *r) __refcount_inc(r, NULL); } -static inline __must_check bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp) +static inline __must_check __signed_wrap +bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp) { int old = atomic_fetch_sub_release(i, &r->refs); -- cgit v1.2.3 From 475ddf1fce1ec4826c8dda40ec59f7f83a7aadb8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 7 Apr 2023 12:27:13 -0700 Subject: fortify: Split reporting and avoid passing string pointer In preparation for KUnit testing and further improvements in fortify failure reporting, split out the report and encode the function and access failure (read or write overflow) into a 
single u8 argument. This mainly ends up saving a tiny bit of space in the data segment. For a defconfig with FORTIFY_SOURCE enabled: $ size gcc/vmlinux.before gcc/vmlinux.after text data bss dec hex filename 26132309 9760658 2195460 38088427 2452eeb gcc/vmlinux.before 26132386 9748382 2195460 38076228 244ff44 gcc/vmlinux.after Reviewed-by: Alexander Lobakin Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 81 +++++++++++++++++++++++++++++++----------- 1 file changed, 60 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 06b3aaa63724..4f6767dcd933 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -2,6 +2,7 @@ #ifndef _LINUX_FORTIFY_STRING_H_ #define _LINUX_FORTIFY_STRING_H_ +#include #include #include #include @@ -9,7 +10,44 @@ #define __FORTIFY_INLINE extern __always_inline __gnu_inline __overloadable #define __RENAME(x) __asm__(#x) -void fortify_panic(const char *name) __noreturn __cold; +#define FORTIFY_REASON_DIR(r) FIELD_GET(BIT(0), r) +#define FORTIFY_REASON_FUNC(r) FIELD_GET(GENMASK(7, 1), r) +#define FORTIFY_REASON(func, write) (FIELD_PREP(BIT(0), write) | \ + FIELD_PREP(GENMASK(7, 1), func)) + +#define fortify_panic(func, write) \ + __fortify_panic(FORTIFY_REASON(func, write)) + +#define FORTIFY_READ 0 +#define FORTIFY_WRITE 1 + +#define EACH_FORTIFY_FUNC(macro) \ + macro(strncpy), \ + macro(strnlen), \ + macro(strlen), \ + macro(strscpy), \ + macro(strlcat), \ + macro(strcat), \ + macro(strncat), \ + macro(memset), \ + macro(memcpy), \ + macro(memmove), \ + macro(memscan), \ + macro(memcmp), \ + macro(memchr), \ + macro(memchr_inv), \ + macro(kmemdup), \ + macro(strcpy), \ + macro(UNKNOWN), + +#define MAKE_FORTIFY_FUNC(func) FORTIFY_FUNC_##func + +enum fortify_func { + EACH_FORTIFY_FUNC(MAKE_FORTIFY_FUNC) +}; + +void __fortify_report(const u8 reason); +void __fortify_panic(const u8 reason) __cold __noreturn; void 
__read_overflow(void) __compiletime_error("detected read beyond size of object (1st parameter)"); void __read_overflow2(void) __compiletime_error("detected read beyond size of object (2nd parameter)"); void __read_overflow2_field(size_t avail, size_t wanted) __compiletime_warning("detected read beyond size of field (2nd parameter); maybe use struct_group()?"); @@ -143,7 +181,7 @@ char *strncpy(char * const POS p, const char *q, __kernel_size_t size) if (__compiletime_lessthan(p_size, size)) __write_overflow(); if (p_size < size) - fortify_panic(__func__); + fortify_panic(FORTIFY_FUNC_strncpy, FORTIFY_WRITE); return __underlying_strncpy(p, q, size); } @@ -174,7 +212,7 @@ __FORTIFY_INLINE __kernel_size_t strnlen(const char * const POS p, __kernel_size /* Do not check characters beyond the end of p. */ ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size); if (p_size <= ret && maxlen != ret) - fortify_panic(__func__); + fortify_panic(FORTIFY_FUNC_strnlen, FORTIFY_READ); return ret; } @@ -210,7 +248,7 @@ __kernel_size_t __fortify_strlen(const char * const POS p) return __underlying_strlen(p); ret = strnlen(p, p_size); if (p_size <= ret) - fortify_panic(__func__); + fortify_panic(FORTIFY_FUNC_strlen, FORTIFY_READ); return ret; } @@ -261,7 +299,7 @@ __FORTIFY_INLINE ssize_t sized_strscpy(char * const POS p, const char * const PO * p_size. */ if (len > p_size) - fortify_panic(__func__); + fortify_panic(FORTIFY_FUNC_strscpy, FORTIFY_WRITE); /* * We can now safely call vanilla strscpy because we are protected from: @@ -319,7 +357,7 @@ size_t strlcat(char * const POS p, const char * const POS q, size_t avail) /* Give up if string is already overflowed. */ if (p_size <= p_len) - fortify_panic(__func__); + fortify_panic(FORTIFY_FUNC_strlcat, FORTIFY_READ); if (actual >= avail) { copy_len = avail - p_len - 1; @@ -328,7 +366,7 @@ size_t strlcat(char * const POS p, const char * const POS q, size_t avail) /* Give up if copy will overflow. 
*/ if (p_size <= actual) - fortify_panic(__func__); + fortify_panic(FORTIFY_FUNC_strlcat, FORTIFY_WRITE); __underlying_memcpy(p + p_len, q, copy_len); p[actual] = '\0'; @@ -357,7 +395,7 @@ char *strcat(char * const POS p, const char *q) const size_t p_size = __member_size(p); if (strlcat(p, q, p_size) >= p_size) - fortify_panic(__func__); + fortify_panic(FORTIFY_FUNC_strcat, FORTIFY_WRITE); return p; } @@ -393,7 +431,7 @@ char *strncat(char * const POS p, const char * const POS q, __kernel_size_t coun p_len = strlen(p); copy_len = strnlen(q, count); if (p_size < p_len + copy_len + 1) - fortify_panic(__func__); + fortify_panic(FORTIFY_FUNC_strncat, FORTIFY_WRITE); __underlying_memcpy(p + p_len, q, copy_len); p[p_len + copy_len] = '\0'; return p; @@ -434,7 +472,7 @@ __FORTIFY_INLINE void fortify_memset_chk(__kernel_size_t size, * lengths are unknown.) */ if (p_size != SIZE_MAX && p_size < size) - fortify_panic("memset"); + fortify_panic(FORTIFY_FUNC_memset, FORTIFY_WRITE); } #define __fortify_memset_chk(p, c, size, p_size, p_size_field) ({ \ @@ -488,7 +526,7 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, const size_t q_size, const size_t p_size_field, const size_t q_size_field, - const char *func) + const u8 func) { if (__builtin_constant_p(size)) { /* @@ -532,9 +570,10 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, * (The SIZE_MAX test is to optimize away checks where the buffer * lengths are unknown.) */ - if ((p_size != SIZE_MAX && p_size < size) || - (q_size != SIZE_MAX && q_size < size)) - fortify_panic(func); + if (p_size != SIZE_MAX && p_size < size) + fortify_panic(func, FORTIFY_WRITE); + else if (q_size != SIZE_MAX && q_size < size) + fortify_panic(func, FORTIFY_READ); /* * Warn when writing beyond destination field size. 
@@ -567,7 +606,7 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, const size_t __q_size_field = (q_size_field); \ WARN_ONCE(fortify_memcpy_chk(__fortify_size, __p_size, \ __q_size, __p_size_field, \ - __q_size_field, #op), \ + __q_size_field, FORTIFY_FUNC_ ##op), \ #op ": detected field-spanning write (size %zu) of single %s (size %zu)\n", \ __fortify_size, \ "field \"" #p "\" at " FILE_LINE, \ @@ -634,7 +673,7 @@ __FORTIFY_INLINE void *memscan(void * const POS0 p, int c, __kernel_size_t size) if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) - fortify_panic(__func__); + fortify_panic(FORTIFY_FUNC_memscan, FORTIFY_READ); return __real_memscan(p, c, size); } @@ -651,7 +690,7 @@ int memcmp(const void * const POS0 p, const void * const POS0 q, __kernel_size_t __read_overflow2(); } if (p_size < size || q_size < size) - fortify_panic(__func__); + fortify_panic(FORTIFY_FUNC_memcmp, FORTIFY_READ); return __underlying_memcmp(p, q, size); } @@ -663,7 +702,7 @@ void *memchr(const void * const POS0 p, int c, __kernel_size_t size) if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) - fortify_panic(__func__); + fortify_panic(FORTIFY_FUNC_memchr, FORTIFY_READ); return __underlying_memchr(p, c, size); } @@ -675,7 +714,7 @@ __FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size) if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) - fortify_panic(__func__); + fortify_panic(FORTIFY_FUNC_memchr_inv, FORTIFY_READ); return __real_memchr_inv(p, c, size); } @@ -688,7 +727,7 @@ __FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) - fortify_panic(__func__); + fortify_panic(FORTIFY_FUNC_kmemdup, FORTIFY_READ); return __real_kmemdup(p, size, gfp); } @@ -725,7 +764,7 @@ char *strcpy(char * const POS p, const char * const POS q) __write_overflow(); /* Run-time 
check for dynamic size overflow. */ if (p_size < size) - fortify_panic(__func__); + fortify_panic(FORTIFY_FUNC_strcpy, FORTIFY_WRITE); __underlying_memcpy(p, q, size); return p; } -- cgit v1.2.3 From 4ce615e798a752d4431fcc52960478906dec2f0e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 7 Apr 2023 12:27:14 -0700 Subject: fortify: Provide KUnit counters for failure testing The standard C string APIs were not designed to have a failure mode; they were expected to always succeed without memory safety issues. Normally, CONFIG_FORTIFY_SOURCE will use fortify_panic() to stop processing, as truncating a read or write may provide an even worse system state. However, this creates a problem for testing under things like KUnit, which needs a way to survive failures. When building with CONFIG_KUNIT, provide a failure path for all users of fortify_panic, and track whether the failure was a read overflow or a write overflow, for KUnit tests to examine. Inspired by similar logic in the slab tests. 
Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 43 ++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 4f6767dcd933..fbfb90479b8f 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -15,8 +15,10 @@ #define FORTIFY_REASON(func, write) (FIELD_PREP(BIT(0), write) | \ FIELD_PREP(GENMASK(7, 1), func)) -#define fortify_panic(func, write) \ - __fortify_panic(FORTIFY_REASON(func, write)) +#ifndef fortify_panic +# define fortify_panic(func, write, retfail) \ + __fortify_panic(FORTIFY_REASON(func, write)) +#endif #define FORTIFY_READ 0 #define FORTIFY_WRITE 1 @@ -181,7 +183,7 @@ char *strncpy(char * const POS p, const char *q, __kernel_size_t size) if (__compiletime_lessthan(p_size, size)) __write_overflow(); if (p_size < size) - fortify_panic(FORTIFY_FUNC_strncpy, FORTIFY_WRITE); + fortify_panic(FORTIFY_FUNC_strncpy, FORTIFY_WRITE, p); return __underlying_strncpy(p, q, size); } @@ -212,7 +214,7 @@ __FORTIFY_INLINE __kernel_size_t strnlen(const char * const POS p, __kernel_size /* Do not check characters beyond the end of p. */ ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size); if (p_size <= ret && maxlen != ret) - fortify_panic(FORTIFY_FUNC_strnlen, FORTIFY_READ); + fortify_panic(FORTIFY_FUNC_strnlen, FORTIFY_READ, ret); return ret; } @@ -248,7 +250,7 @@ __kernel_size_t __fortify_strlen(const char * const POS p) return __underlying_strlen(p); ret = strnlen(p, p_size); if (p_size <= ret) - fortify_panic(FORTIFY_FUNC_strlen, FORTIFY_READ); + fortify_panic(FORTIFY_FUNC_strlen, FORTIFY_READ, ret); return ret; } @@ -299,7 +301,7 @@ __FORTIFY_INLINE ssize_t sized_strscpy(char * const POS p, const char * const PO * p_size. 
*/ if (len > p_size) - fortify_panic(FORTIFY_FUNC_strscpy, FORTIFY_WRITE); + fortify_panic(FORTIFY_FUNC_strscpy, FORTIFY_WRITE, -E2BIG); /* * We can now safely call vanilla strscpy because we are protected from: @@ -357,7 +359,7 @@ size_t strlcat(char * const POS p, const char * const POS q, size_t avail) /* Give up if string is already overflowed. */ if (p_size <= p_len) - fortify_panic(FORTIFY_FUNC_strlcat, FORTIFY_READ); + fortify_panic(FORTIFY_FUNC_strlcat, FORTIFY_READ, wanted); if (actual >= avail) { copy_len = avail - p_len - 1; @@ -366,7 +368,7 @@ size_t strlcat(char * const POS p, const char * const POS q, size_t avail) /* Give up if copy will overflow. */ if (p_size <= actual) - fortify_panic(FORTIFY_FUNC_strlcat, FORTIFY_WRITE); + fortify_panic(FORTIFY_FUNC_strlcat, FORTIFY_WRITE, wanted); __underlying_memcpy(p + p_len, q, copy_len); p[actual] = '\0'; @@ -395,7 +397,7 @@ char *strcat(char * const POS p, const char *q) const size_t p_size = __member_size(p); if (strlcat(p, q, p_size) >= p_size) - fortify_panic(FORTIFY_FUNC_strcat, FORTIFY_WRITE); + fortify_panic(FORTIFY_FUNC_strcat, FORTIFY_WRITE, p); return p; } @@ -431,13 +433,13 @@ char *strncat(char * const POS p, const char * const POS q, __kernel_size_t coun p_len = strlen(p); copy_len = strnlen(q, count); if (p_size < p_len + copy_len + 1) - fortify_panic(FORTIFY_FUNC_strncat, FORTIFY_WRITE); + fortify_panic(FORTIFY_FUNC_strncat, FORTIFY_WRITE, p); __underlying_memcpy(p + p_len, q, copy_len); p[p_len + copy_len] = '\0'; return p; } -__FORTIFY_INLINE void fortify_memset_chk(__kernel_size_t size, +__FORTIFY_INLINE bool fortify_memset_chk(__kernel_size_t size, const size_t p_size, const size_t p_size_field) { @@ -472,7 +474,8 @@ __FORTIFY_INLINE void fortify_memset_chk(__kernel_size_t size, * lengths are unknown.) 
*/ if (p_size != SIZE_MAX && p_size < size) - fortify_panic(FORTIFY_FUNC_memset, FORTIFY_WRITE); + fortify_panic(FORTIFY_FUNC_memset, FORTIFY_WRITE, true); + return false; } #define __fortify_memset_chk(p, c, size, p_size, p_size_field) ({ \ @@ -571,9 +574,9 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, * lengths are unknown.) */ if (p_size != SIZE_MAX && p_size < size) - fortify_panic(func, FORTIFY_WRITE); + fortify_panic(func, FORTIFY_WRITE, true); else if (q_size != SIZE_MAX && q_size < size) - fortify_panic(func, FORTIFY_READ); + fortify_panic(func, FORTIFY_READ, true); /* * Warn when writing beyond destination field size. @@ -673,7 +676,7 @@ __FORTIFY_INLINE void *memscan(void * const POS0 p, int c, __kernel_size_t size) if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) - fortify_panic(FORTIFY_FUNC_memscan, FORTIFY_READ); + fortify_panic(FORTIFY_FUNC_memscan, FORTIFY_READ, NULL); return __real_memscan(p, c, size); } @@ -690,7 +693,7 @@ int memcmp(const void * const POS0 p, const void * const POS0 q, __kernel_size_t __read_overflow2(); } if (p_size < size || q_size < size) - fortify_panic(FORTIFY_FUNC_memcmp, FORTIFY_READ); + fortify_panic(FORTIFY_FUNC_memcmp, FORTIFY_READ, INT_MIN); return __underlying_memcmp(p, q, size); } @@ -702,7 +705,7 @@ void *memchr(const void * const POS0 p, int c, __kernel_size_t size) if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) - fortify_panic(FORTIFY_FUNC_memchr, FORTIFY_READ); + fortify_panic(FORTIFY_FUNC_memchr, FORTIFY_READ, NULL); return __underlying_memchr(p, c, size); } @@ -714,7 +717,7 @@ __FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size) if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) - fortify_panic(FORTIFY_FUNC_memchr_inv, FORTIFY_READ); + fortify_panic(FORTIFY_FUNC_memchr_inv, FORTIFY_READ, NULL); return __real_memchr_inv(p, c, size); } @@ -727,7 +730,7 @@ __FORTIFY_INLINE 
void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) - fortify_panic(FORTIFY_FUNC_kmemdup, FORTIFY_READ); + fortify_panic(FORTIFY_FUNC_kmemdup, FORTIFY_READ, NULL); return __real_kmemdup(p, size, gfp); } @@ -764,7 +767,7 @@ char *strcpy(char * const POS p, const char * const POS q) __write_overflow(); /* Run-time check for dynamic size overflow. */ if (p_size < size) - fortify_panic(FORTIFY_FUNC_strcpy, FORTIFY_WRITE); + fortify_panic(FORTIFY_FUNC_strcpy, FORTIFY_WRITE, p); __underlying_memcpy(p, q, size); return p; } -- cgit v1.2.3 From 3d965b33e40d973b450cb0212913f039476c16f4 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 7 Apr 2023 12:27:16 -0700 Subject: fortify: Improve buffer overflow reporting Improve the reporting of buffer overflows under CONFIG_FORTIFY_SOURCE to help accelerate debugging efforts. The calculations are all just sitting in registers anyway, so pass them along to the function to be reported. 
For example, before: detected buffer overflow in memcpy and after: memcpy: detected buffer overflow: 4096 byte read of buffer size 1 Link: https://lore.kernel.org/r/20230407192717.636137-10-keescook@chromium.org Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 56 ++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index fbfb90479b8f..6aeebe0a6777 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -16,8 +16,8 @@ FIELD_PREP(GENMASK(7, 1), func)) #ifndef fortify_panic -# define fortify_panic(func, write, retfail) \ - __fortify_panic(FORTIFY_REASON(func, write)) +# define fortify_panic(func, write, avail, size, retfail) \ + __fortify_panic(FORTIFY_REASON(func, write), avail, size) #endif #define FORTIFY_READ 0 @@ -48,8 +48,8 @@ enum fortify_func { EACH_FORTIFY_FUNC(MAKE_FORTIFY_FUNC) }; -void __fortify_report(const u8 reason); -void __fortify_panic(const u8 reason) __cold __noreturn; +void __fortify_report(const u8 reason, const size_t avail, const size_t size); +void __fortify_panic(const u8 reason, const size_t avail, const size_t size) __cold __noreturn; void __read_overflow(void) __compiletime_error("detected read beyond size of object (1st parameter)"); void __read_overflow2(void) __compiletime_error("detected read beyond size of object (2nd parameter)"); void __read_overflow2_field(size_t avail, size_t wanted) __compiletime_warning("detected read beyond size of field (2nd parameter); maybe use struct_group()?"); @@ -183,7 +183,7 @@ char *strncpy(char * const POS p, const char *q, __kernel_size_t size) if (__compiletime_lessthan(p_size, size)) __write_overflow(); if (p_size < size) - fortify_panic(FORTIFY_FUNC_strncpy, FORTIFY_WRITE, p); + fortify_panic(FORTIFY_FUNC_strncpy, FORTIFY_WRITE, p_size, size, p); return __underlying_strncpy(p, q, size); } @@ -214,7 +214,7 @@ 
__FORTIFY_INLINE __kernel_size_t strnlen(const char * const POS p, __kernel_size /* Do not check characters beyond the end of p. */ ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size); if (p_size <= ret && maxlen != ret) - fortify_panic(FORTIFY_FUNC_strnlen, FORTIFY_READ, ret); + fortify_panic(FORTIFY_FUNC_strnlen, FORTIFY_READ, p_size, ret + 1, ret); return ret; } @@ -250,7 +250,7 @@ __kernel_size_t __fortify_strlen(const char * const POS p) return __underlying_strlen(p); ret = strnlen(p, p_size); if (p_size <= ret) - fortify_panic(FORTIFY_FUNC_strlen, FORTIFY_READ, ret); + fortify_panic(FORTIFY_FUNC_strlen, FORTIFY_READ, p_size, ret + 1, ret); return ret; } @@ -300,8 +300,8 @@ __FORTIFY_INLINE ssize_t sized_strscpy(char * const POS p, const char * const PO * Generate a runtime write overflow error if len is greater than * p_size. */ - if (len > p_size) - fortify_panic(FORTIFY_FUNC_strscpy, FORTIFY_WRITE, -E2BIG); + if (p_size < len) + fortify_panic(FORTIFY_FUNC_strscpy, FORTIFY_WRITE, p_size, len, -E2BIG); /* * We can now safely call vanilla strscpy because we are protected from: @@ -359,7 +359,7 @@ size_t strlcat(char * const POS p, const char * const POS q, size_t avail) /* Give up if string is already overflowed. */ if (p_size <= p_len) - fortify_panic(FORTIFY_FUNC_strlcat, FORTIFY_READ, wanted); + fortify_panic(FORTIFY_FUNC_strlcat, FORTIFY_READ, p_size, p_len + 1, wanted); if (actual >= avail) { copy_len = avail - p_len - 1; @@ -368,7 +368,7 @@ size_t strlcat(char * const POS p, const char * const POS q, size_t avail) /* Give up if copy will overflow. 
*/ if (p_size <= actual) - fortify_panic(FORTIFY_FUNC_strlcat, FORTIFY_WRITE, wanted); + fortify_panic(FORTIFY_FUNC_strlcat, FORTIFY_WRITE, p_size, actual + 1, wanted); __underlying_memcpy(p + p_len, q, copy_len); p[actual] = '\0'; @@ -395,9 +395,10 @@ __FORTIFY_INLINE __diagnose_as(__builtin_strcat, 1, 2) char *strcat(char * const POS p, const char *q) { const size_t p_size = __member_size(p); + const size_t wanted = strlcat(p, q, p_size); - if (strlcat(p, q, p_size) >= p_size) - fortify_panic(FORTIFY_FUNC_strcat, FORTIFY_WRITE, p); + if (p_size <= wanted) + fortify_panic(FORTIFY_FUNC_strcat, FORTIFY_WRITE, p_size, wanted + 1, p); return p; } @@ -426,14 +427,15 @@ char *strncat(char * const POS p, const char * const POS q, __kernel_size_t coun { const size_t p_size = __member_size(p); const size_t q_size = __member_size(q); - size_t p_len, copy_len; + size_t p_len, copy_len, total; if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __underlying_strncat(p, q, count); p_len = strlen(p); copy_len = strnlen(q, count); - if (p_size < p_len + copy_len + 1) - fortify_panic(FORTIFY_FUNC_strncat, FORTIFY_WRITE, p); + total = p_len + copy_len + 1; + if (p_size < total) + fortify_panic(FORTIFY_FUNC_strncat, FORTIFY_WRITE, p_size, total, p); __underlying_memcpy(p + p_len, q, copy_len); p[p_len + copy_len] = '\0'; return p; @@ -474,7 +476,7 @@ __FORTIFY_INLINE bool fortify_memset_chk(__kernel_size_t size, * lengths are unknown.) */ if (p_size != SIZE_MAX && p_size < size) - fortify_panic(FORTIFY_FUNC_memset, FORTIFY_WRITE, true); + fortify_panic(FORTIFY_FUNC_memset, FORTIFY_WRITE, p_size, size, true); return false; } @@ -574,9 +576,9 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, * lengths are unknown.) 
*/ if (p_size != SIZE_MAX && p_size < size) - fortify_panic(func, FORTIFY_WRITE, true); + fortify_panic(func, FORTIFY_WRITE, p_size, size, true); else if (q_size != SIZE_MAX && q_size < size) - fortify_panic(func, FORTIFY_READ, true); + fortify_panic(func, FORTIFY_READ, p_size, size, true); /* * Warn when writing beyond destination field size. @@ -676,7 +678,7 @@ __FORTIFY_INLINE void *memscan(void * const POS0 p, int c, __kernel_size_t size) if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) - fortify_panic(FORTIFY_FUNC_memscan, FORTIFY_READ, NULL); + fortify_panic(FORTIFY_FUNC_memscan, FORTIFY_READ, p_size, size, NULL); return __real_memscan(p, c, size); } @@ -692,8 +694,10 @@ int memcmp(const void * const POS0 p, const void * const POS0 q, __kernel_size_t if (__compiletime_lessthan(q_size, size)) __read_overflow2(); } - if (p_size < size || q_size < size) - fortify_panic(FORTIFY_FUNC_memcmp, FORTIFY_READ, INT_MIN); + if (p_size < size) + fortify_panic(FORTIFY_FUNC_memcmp, FORTIFY_READ, p_size, size, INT_MIN); + else if (q_size < size) + fortify_panic(FORTIFY_FUNC_memcmp, FORTIFY_READ, q_size, size, INT_MIN); return __underlying_memcmp(p, q, size); } @@ -705,7 +709,7 @@ void *memchr(const void * const POS0 p, int c, __kernel_size_t size) if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) - fortify_panic(FORTIFY_FUNC_memchr, FORTIFY_READ, NULL); + fortify_panic(FORTIFY_FUNC_memchr, FORTIFY_READ, p_size, size, NULL); return __underlying_memchr(p, c, size); } @@ -717,7 +721,7 @@ __FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size) if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) - fortify_panic(FORTIFY_FUNC_memchr_inv, FORTIFY_READ, NULL); + fortify_panic(FORTIFY_FUNC_memchr_inv, FORTIFY_READ, p_size, size, NULL); return __real_memchr_inv(p, c, size); } @@ -730,7 +734,7 @@ __FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t 
gfp if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) - fortify_panic(FORTIFY_FUNC_kmemdup, FORTIFY_READ, NULL); + fortify_panic(FORTIFY_FUNC_kmemdup, FORTIFY_READ, p_size, size, NULL); return __real_kmemdup(p, size, gfp); } @@ -767,7 +771,7 @@ char *strcpy(char * const POS p, const char * const POS q) __write_overflow(); /* Run-time check for dynamic size overflow. */ if (p_size < size) - fortify_panic(FORTIFY_FUNC_strcpy, FORTIFY_WRITE, p); + fortify_panic(FORTIFY_FUNC_strcpy, FORTIFY_WRITE, p_size, size, p); __underlying_memcpy(p, q, size); return p; } -- cgit v1.2.3 From 57914905f3ff2212a949e7191d52d9994c2c6215 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 3 Oct 2023 16:01:42 +0300 Subject: kernel.h: Move lib/cmdline.c prototypes to string.h The lib/cmdline.c is basically a set of some small string parsers which are widely used in the kernel. Their prototypes belong to the string.h rather than kernel.h. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20231003130142.2936503-1-andriy.shevchenko@linux.intel.com Signed-off-by: Kees Cook --- include/linux/kernel.h | 6 ------ include/linux/string.h | 8 ++++++++ 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 86dd8939c2cd..d718fbec72dd 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -165,12 +165,6 @@ static inline void might_fault(void) { } void do_exit(long error_code) __noreturn; -extern int get_option(char **str, int *pint); -extern char *get_options(const char *str, int nints, int *ints); -extern unsigned long long memparse(const char *ptr, char **retptr); -extern bool parse_option_str(const char *str, const char *option); -extern char *next_arg(char *args, char **param, char **val); - extern int core_kernel_text(unsigned long addr); extern int __kernel_text_address(unsigned long addr); extern int kernel_text_address(unsigned long addr); diff
--git a/include/linux/string.h b/include/linux/string.h index 96e6b1af86b5..adf3b3eb0ab7 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -286,9 +286,17 @@ extern void *kmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); extern void *kvmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp); +/* lib/argv_split.c */ extern char **argv_split(gfp_t gfp, const char *str, int *argcp); extern void argv_free(char **argv); +/* lib/cmdline.c */ +extern int get_option(char **str, int *pint); +extern char *get_options(const char *str, int nints, int *ints); +extern unsigned long long memparse(const char *ptr, char **retptr); +extern bool parse_option_str(const char *str, const char *option); +extern char *next_arg(char *args, char **param, char **val); + extern bool sysfs_streq(const char *s1, const char *s2); int match_string(const char * const *array, size_t n, const char *string); int __sysfs_match_string(const char * const *array, size_t n, const char *s); -- cgit v1.2.3 From c5e6d3d85efa7451590edd94725b4b280e2fd8a3 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 28 Feb 2024 22:41:31 +0200 Subject: overflow: Use POD in check_shl_overflow() The check_shl_overflow() uses u64 type that is defined in types.h. Instead of including that header, just switch to use POD type directly. 
Signed-off-by: Andy Shevchenko Acked-by: Kees Cook Link: https://lore.kernel.org/r/20240228204919.3680786-2-andriy.shevchenko@linux.intel.com Signed-off-by: Kees Cook --- include/linux/overflow.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/overflow.h b/include/linux/overflow.h index dede374832c9..bc390f026128 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -197,7 +197,7 @@ static inline bool __must_check __must_check_overflow(bool overflow) typeof(a) _a = a; \ typeof(s) _s = s; \ typeof(d) _d = d; \ - u64 _a_full = _a; \ + unsigned long long _a_full = _a; \ unsigned int _to_shift = \ is_non_negative(_s) && _s < 8 * sizeof(*d) ? _s : 0; \ *_d = (_a_full << _to_shift); \ -- cgit v1.2.3 From 1acd92d95fa24edca8f0292b21870025da93e24f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Feb 2024 15:38:55 -1000 Subject: workqueue: Drain BH work items on hot-unplugged CPUs Boqun pointed out that workqueues aren't handling BH work items on offlined CPUs. Unlike tasklet which transfers out the pending tasks from CPUHP_SOFTIRQ_DEAD, BH workqueue would just leave them pending which is problematic. Note that this behavior is specific to BH workqueues as the non-BH per-CPU workers just become unbound when the CPU goes offline. This patch fixes the issue by draining the pending BH work items from an offlined CPU from CPUHP_SOFTIRQ_DEAD. Because work items carry more context, it's not as easy to transfer the pending work items from one pool to another. Instead, run BH work items which execute the offlined pools on an online CPU. Note that this assumes that no further BH work items will be queued on the offlined CPUs. This assumption is shared with tasklet and should be fine for conversions. 
However, this issue also exists for per-CPU workqueues which will just keep executing work items queued after CPU offline on unbound workers and workqueue should reject per-CPU and BH work items queued on offline CPUs. This will be addressed separately later. Signed-off-by: Tejun Heo Reported-and-reviewed-by: Boqun Feng Link: http://lkml.kernel.org/r/Zdvw0HdSXcU3JZ4g@boqun-archlinux --- include/linux/workqueue.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 64a60b9232d3..158784dd189a 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -458,6 +458,7 @@ extern struct workqueue_struct *system_bh_wq; extern struct workqueue_struct *system_bh_highpri_wq; void workqueue_softirq_action(bool highpri); +void workqueue_softirq_dead(unsigned int cpu); /** * alloc_workqueue - allocate a workqueue -- cgit v1.2.3 From f0b7f8ade9d2532a7d7da40eb297570d48dd2147 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 29 Feb 2024 22:52:30 +0200 Subject: lib/string_helpers: Add flags param to string_get_size() The new flags parameter allows controlling - Whether or not the units suffix is separated by a space, for compatibility with sort -h - Whether or not to append a B suffix - we're not always printing bytes. Co-developed-by: Kent Overstreet Signed-off-by: Kent Overstreet Signed-off-by: Andy Shevchenko Reviewed-by: Kent Overstreet Link: https://lore.kernel.org/r/20240229205345.93902-1-andriy.shevchenko@linux.intel.com Signed-off-by: Kees Cook --- include/linux/string_helpers.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h index 58fb1f90eda5..e93fbb5b0c01 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h @@ -17,14 +17,18 @@ static inline bool string_is_terminated(const char *s, int len) return memchr(s, '\0', len) ? 
true : false; } -/* Descriptions of the types of units to - * print in */ +/* Descriptions of the types of units to print in */ enum string_size_units { STRING_UNITS_10, /* use powers of 10^3 (standard SI) */ STRING_UNITS_2, /* use binary powers of 2^10 */ + STRING_UNITS_MASK = BIT(0), + + /* Modifiers */ + STRING_UNITS_NO_SPACE = BIT(30), + STRING_UNITS_NO_BYTES = BIT(31), }; -int string_get_size(u64 size, u64 blk_size, enum string_size_units units, +int string_get_size(u64 size, u64 blk_size, const enum string_size_units units, char *buf, int len); int parse_int_array_user(const char __user *from, size_t count, int **array); -- cgit v1.2.3 From b8209544296edbd1af186e2ea9c648642c37b18c Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Wed, 28 Feb 2024 16:45:33 -0800 Subject: Drivers: hv: vmbus: Calculate ring buffer size for more efficient use of memory The VMBUS_RING_SIZE macro adds space for a ring buffer header to the requested ring buffer size. The header size is always 1 page, and so its size varies based on the PAGE_SIZE for which the kernel is built. If the requested ring buffer size is a large power-of-2 size and the header size is small, the resulting size is inefficient in its use of memory. For example, a 512 Kbyte ring buffer with a 4 Kbyte page size results in a 516 Kbyte allocation, which is rounded up to 1 Mbyte by the memory allocator, and wastes 508 Kbytes of memory. In such situations, the exact size of the ring buffer isn't that important, and it's OK to allocate the 4 Kbyte header at the beginning of the 512 Kbytes, leaving the ring buffer itself with just 508 Kbytes. The memory allocation can be 512 Kbytes instead of 1 Mbyte and nothing is wasted. Update VMBUS_RING_SIZE to implement this approach for "large" ring buffer sizes. "Large" is somewhat arbitrarily defined as 8 times the size of the ring buffer header (which is of size PAGE_SIZE).
For example, for 4 Kbyte PAGE_SIZE, ring buffers of 32 Kbytes and larger use the first 4 Kbytes as the ring buffer header. For 64 Kbyte PAGE_SIZE, ring buffers of 512 Kbytes and larger use the first 64 Kbytes as the ring buffer header. In both cases, smaller sizes add space for the header so the ring size isn't reduced too much by using part of the space for the header. For example, with a 64 Kbyte page size, we don't want a 128 Kbyte ring buffer to be reduced to 64 Kbytes by allocating half of the space for the header. In such a case, the memory allocation is less efficient, but it's the best that can be done. While the new algorithm slightly changes the amount of space allocated for ring buffers by drivers that use VMBUS_RING_SIZE, the devices aren't known to be sensitive to small changes in ring buffer size, so there shouldn't be any effect. Fixes: c1135c7fd0e9 ("Drivers: hv: vmbus: Introduce types of GPADL") Fixes: 6941f67ad37d ("hv_netvsc: Calculate correct ring size when PAGE_SIZE is not 4 Kbytes") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218502 Cc: stable@vger.kernel.org Signed-off-by: Michael Kelley Reviewed-by: Saurabh Sengar Reviewed-by: Dexuan Cui Tested-by: Souradeep Chakrabarti Link: https://lore.kernel.org/r/20240229004533.313662-1-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20240229004533.313662-1-mhklinux@outlook.com> --- include/linux/hyperv.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 2b00faf98017..6ef0557b4bff 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -164,8 +164,28 @@ struct hv_ring_buffer { u8 buffer[]; } __packed; + +/* + * If the requested ring buffer size is at least 8 times the size of the + * header, steal space from the ring buffer for the header. Otherwise, add + * space for the header so that is doesn't take too much of the ring buffer + * space. 
+ * + * The factor of 8 is somewhat arbitrary. The goal is to prevent adding a + * relatively small header (4 Kbytes on x86) to a large-ish power-of-2 ring + * buffer size (such as 128 Kbytes) and so end up making a nearly twice as + * large allocation that will be almost half wasted. As a contrasting example, + * on ARM64 with 64 Kbyte page size, we don't want to take 64 Kbytes for the + * header from a 128 Kbyte allocation, leaving only 64 Kbytes for the ring. + * In this latter case, we must add 64 Kbytes for the header and not worry + * about what's wasted. + */ +#define VMBUS_HEADER_ADJ(payload_sz) \ + ((payload_sz) >= 8 * sizeof(struct hv_ring_buffer) ? \ + 0 : sizeof(struct hv_ring_buffer)) + /* Calculate the proper size of a ringbuffer, it must be page-aligned */ -#define VMBUS_RING_SIZE(payload_sz) PAGE_ALIGN(sizeof(struct hv_ring_buffer) + \ +#define VMBUS_RING_SIZE(payload_sz) PAGE_ALIGN(VMBUS_HEADER_ADJ(payload_sz) + \ (payload_sz)) struct hv_ring_buffer_info { -- cgit v1.2.3 From 096361b15577a583afcc28179a08c75cf95e9dae Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 28 Feb 2024 13:54:25 +0000 Subject: ipv6: add ipv6_devconf_read_txrx cacheline_group IPv6 TX and RX fast path use the following fields: - disable_ipv6 - hop_limit - mtu6 - forwarding - disable_policy - proxy_ndp Place them in a group to increase data locality. Signed-off-by: Eric Dumazet Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/ipv6.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index ef3aa060a289..383a0ea2ab91 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -3,6 +3,7 @@ #define _IPV6_H #include +#include #define ipv6_optlen(p) (((p)->hdrlen+1) << 3) #define ipv6_authlen(p) (((p)->hdrlen+2) << 2) @@ -10,9 +11,16 @@ * This structure contains configuration options per IPv6 link. 
*/ struct ipv6_devconf { - __s32 forwarding; + /* RX & TX fastpath fields. */ + __cacheline_group_begin(ipv6_devconf_read_txrx); + __s32 disable_ipv6; __s32 hop_limit; __s32 mtu6; + __s32 forwarding; + __s32 disable_policy; + __s32 proxy_ndp; + __cacheline_group_end(ipv6_devconf_read_txrx); + __s32 accept_ra; __s32 accept_redirects; __s32 autoconf; @@ -45,7 +53,6 @@ struct ipv6_devconf { __s32 accept_ra_rt_info_max_plen; #endif #endif - __s32 proxy_ndp; __s32 accept_source_route; __s32 accept_ra_from_local; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD @@ -55,7 +62,6 @@ struct ipv6_devconf { #ifdef CONFIG_IPV6_MROUTE atomic_t mc_forwarding; #endif - __s32 disable_ipv6; __s32 drop_unicast_in_l2_multicast; __s32 accept_dad; __s32 force_tllao; @@ -76,7 +82,6 @@ struct ipv6_devconf { #endif __u32 enhanced_dad; __u32 addr_gen_mode; - __s32 disable_policy; __s32 ndisc_tclass; __s32 rpl_seg_enabled; __u32 ioam6_id; -- cgit v1.2.3 From f29f9199c2d2b3c258f577f438885288016847ed Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 28 Feb 2024 15:05:29 +0100 Subject: Simplify net_dbg_ratelimited() dummy There is no need to wrap calls to the no_printk() helper inside an always-false check, as no_printk() already does that internally. Signed-off-by: Geert Uytterhoeven Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/net.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index c9b4a63791a4..15df6d5f27a7 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -299,10 +299,7 @@ do { \ net_ratelimited_function(pr_debug, fmt, ##__VA_ARGS__) #else #define net_dbg_ratelimited(fmt, ...) 
\ - do { \ - if (0) \ - no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ - } while (0) + no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif #define net_get_random_once(buf, nbytes) \ -- cgit v1.2.3 From eb2c11b27c58a62b5027b77f702c15cd0ca38f7d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 28 Feb 2024 17:06:56 +0100 Subject: net: bql: fix building with BQL disabled It is now possible to disable BQL, but that causes the cpsw driver to break: drivers/net/ethernet/ti/am65-cpsw-nuss.c:297:28: error: no member named 'dql' in 'struct netdev_queue' 297 | dql_avail(&netif_txq->dql), There is already a helper function in net/sch_generic.h that could be used to help here. Move its implementation into the common linux/netdevice.h along with the other bql interfaces and change both users over to the new interface. Fixes: ea7f3cfaa588 ("net: bql: allow the config to be disabled") Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- include/linux/netdevice.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a9c973b92294..735a9386fcf8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3499,6 +3499,16 @@ static inline void netdev_queue_set_dql_min_limit(struct netdev_queue *dev_queue #endif } +static inline int netdev_queue_dql_avail(const struct netdev_queue *txq) +{ +#ifdef CONFIG_BQL + /* Non-BQL migrated drivers will return 0, too. */ + return dql_avail(&txq->dql); +#else + return 0; +#endif +} + /** * netdev_txq_bql_enqueue_prefetchw - prefetch bql data for write * @dev_queue: pointer to transmit queue -- cgit v1.2.3 From cb12fd8e0dabb9a1c8aef55a6a41e2c255fcdf4b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 12 Feb 2024 16:32:38 +0100 Subject: pidfd: add pidfs This moves pidfds from the anonymous inode infrastructure to a tiny pseudo filesystem. 
This has been on my todo for quite a while as it will unblock further work that we weren't able to do simply because of the very justified limitations of anonymous inodes. Moving pidfds to a tiny pseudo filesystem allows: * statx() on pidfds becomes useful for the first time. * pidfds can be compared simply via statx() and then comparing inode numbers. * pidfds have unique inode numbers for the system lifetime. * struct pid is now stashed in inode->i_private instead of file->private_data. This means it is now possible to introduce concepts that operate on a process once all file descriptors have been closed. A concrete example is kill-on-last-close. * file->private_data is freed up for per-file options for pidfds. * Each struct pid will refer to a different inode but the same struct pid will refer to the same inode if it's opened multiple times. In contrast to now where each struct pid refers to the same inode. Even if we were to move to anon_inode_create_getfile() which creates new inodes we'd still be associating the same struct pid with multiple different inodes. The tiny pseudo filesystem is not visible anywhere in userspace exactly like e.g., pipefs and sockfs. There's no lookup, there's no complex inode operations, nothing. Dentries and inodes are always deleted when the last pidfd is closed. We allocate a new inode for each struct pid and we reuse that inode for all pidfds. We use iget_locked() to find that inode again based on the inode number which isn't recycled. We allocate a new dentry for each pidfd that uses the same inode. That is similar to anonymous inodes which reuse the same inode for thousands of dentries. For pidfds we're talking way less than that. There usually won't be a lot of concurrent openers of the same struct pid. They can probably often be counted on two hands. I know that systemd does use separate pidfd for the same struct pid for various complex process tracking issues. So I think with that things actually become way simpler. 
Especially because we don't have to care about lookup. Dentries and inodes continue to be always deleted. The code is entirely optional and fairly small. If it's not selected we fallback to anonymous inodes. Heavily inspired by nsfs which uses a similar stashing mechanism just for namespaces. Link: https://lore.kernel.org/r/20240213-vfs-pidfd_fs-v1-2-f863f58cfce1@kernel.org Signed-off-by: Christian Brauner --- include/linux/pid.h | 5 +++-- include/linux/pidfs.h | 8 ++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) create mode 100644 include/linux/pidfs.h (limited to 'include/linux') diff --git a/include/linux/pid.h b/include/linux/pid.h index 8124d57752b9..956481128e8d 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -55,6 +55,9 @@ struct pid refcount_t count; unsigned int level; spinlock_t lock; +#ifdef CONFIG_FS_PID + unsigned long ino; +#endif /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; struct hlist_head inodes; @@ -66,8 +69,6 @@ struct pid extern struct pid init_struct_pid; -extern const struct file_operations pidfd_fops; - struct file; struct pid *pidfd_pid(const struct file *file); diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h new file mode 100644 index 000000000000..75bdf9807802 --- /dev/null +++ b/include/linux/pidfs.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_PID_FS_H +#define _LINUX_PID_FS_H + +struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); +void __init pidfs_init(void); + +#endif /* _LINUX_PID_FS_H */ -- cgit v1.2.3 From 1fa08aece42512be072351f482096d5796edf7ca Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 18 Feb 2024 14:51:23 +0100 Subject: nsfs: convert to path_from_stashed() helper Use the newly added path_from_stashed() helper for nsfs. 
Link: https://lore.kernel.org/r/20240218-neufahrzeuge-brauhaus-fb0eb6459771@brauner Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 2 +- include/linux/proc_ns.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 0f1d024bd958..7d22ea50b098 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -7,7 +7,7 @@ struct proc_ns_operations; struct ns_common { - atomic_long_t stashed; + struct dentry *stashed; const struct proc_ns_operations *ops; unsigned int inum; refcount_t count; diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 49539bc416ce..5ea470eb4d76 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -66,7 +66,7 @@ static inline void proc_free_inum(unsigned int inum) {} static inline int ns_alloc_inum(struct ns_common *ns) { - atomic_long_set(&ns->stashed, 0); + WRITE_ONCE(ns->stashed, NULL); return proc_alloc_inum(&ns->inum); } -- cgit v1.2.3 From b28ddcc32d8fa3e20745b3a47dff863fe0376d79 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 19 Feb 2024 16:30:57 +0100 Subject: pidfs: convert to path_from_stashed() helper Moving pidfds from the anonymous inode infrastructure to a separate tiny in-kernel filesystem similar to sockfs, pipefs, and anon_inodefs causes selinux denials and thus various userspace components that make heavy use of pidfds to fail as pidfds used anon_inode_getfile() which aren't subject to any LSM hooks. But dentry_open() is and that would cause regressions. The failures that are seen are selinux denials. But the core failure is dbus-broker. That cascades into other services failing that depend on dbus-broker. For example, when dbus-broker fails to start polkit and all the others won't be able to work because they depend on dbus-broker. The reason for dbus-broker failing is because it doesn't handle failures for SO_PEERPIDFD correctly. 
Last kernel release we introduced SO_PEERPIDFD (and SCM_PIDFD). SO_PEERPIDFD allows dbus-broker and polkit and others to receive a pidfd for the peer of an AF_UNIX socket. This is the first time in the history of Linux that we can safely authenticate clients in a race-free manner. dbus-broker immediately made use of this but messed up the error checking. It only allowed EINVAL as a valid failure for SO_PEERPIDFD. That's obviously problematic not just because of LSM denials but because of seccomp denials that would prevent SO_PEERPIDFD from working; or any other new error code from there. So this is catching a flawed implementation in dbus-broker as well. It has to fall back to the old pid-based authentication when SO_PEERPIDFD doesn't work, no matter the reason; otherwise it'll always risk such failures. So overall that LSM denial should not have caused dbus-broker to fail. It can never assume that a feature released one kernel ago, like SO_PEERPIDFD, is available. So, the next fix separate from the selinux policy update is to try and fix dbus-broker at [3]. That should make it into Fedora as well. In addition the selinux reference policy should also be updated. See [4] for that. If Selinux is in enforcing mode in userspace and it encounters anything that it doesn't know about it will deny it by default. And the policy is entirely in userspace including declaring new types for stuff like nsfs or pidfs to allow it. For now we continue to raise S_PRIVATE on the inode if it's a pidfs inode which means things behave exactly like before.
Link: https://bugzilla.redhat.com/show_bug.cgi?id=2265630 Link: https://github.com/fedora-selinux/selinux-policy/pull/2050 Link: https://github.com/bus1/dbus-broker/pull/343 [3] Link: https://github.com/SELinuxProject/refpolicy/pull/762 [4] Reported-by: Nathan Chancellor Link: https://lore.kernel.org/r/20240222190334.GA412503@dev-arch.thelio-3990X Link: https://lore.kernel.org/r/20240218-neufahrzeuge-brauhaus-fb0eb6459771@brauner Signed-off-by: Christian Brauner --- include/linux/pid.h | 1 + include/linux/pidfs.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pid.h b/include/linux/pid.h index 956481128e8d..c79a0efd0258 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -56,6 +56,7 @@ struct pid unsigned int level; spinlock_t lock; #ifdef CONFIG_FS_PID + struct dentry *stashed; unsigned long ino; #endif /* lists of tasks that use this pid */ diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 75bdf9807802..40dd325a32a6 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -4,5 +4,6 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); void __init pidfs_init(void); +bool is_pidfs_sb(const struct super_block *sb); #endif /* _LINUX_PID_FS_H */ -- cgit v1.2.3 From b42a905b6aad40c092cf17f4b295a4c389bc7206 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 16 Feb 2024 15:40:25 +0100 Subject: iommu: constify of_phandle_args in xlate The xlate callbacks are supposed to translate of_phandle_args to proper provider without modifying the of_phandle_args. Make the argument pointer to const for code safety and readability. 
Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240216144027.185959-2-krzysztof.kozlowski@linaro.org Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7cc56cfe98dd..98a958621dcb 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -567,7 +567,7 @@ struct iommu_ops { /* Request/Free a list of reserved regions for a device */ void (*get_resv_regions)(struct device *dev, struct list_head *list); - int (*of_xlate)(struct device *dev, struct of_phandle_args *args); + int (*of_xlate)(struct device *dev, const struct of_phandle_args *args); bool (*is_attach_deferred)(struct device *dev); /* Per device IOMMU features */ @@ -985,7 +985,7 @@ struct iommu_mm_data { int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, const struct iommu_ops *ops); void iommu_fwspec_free(struct device *dev); -int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids); +int iommu_fwspec_add_ids(struct device *dev, const u32 *ids, int num_ids); const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode); static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) -- cgit v1.2.3 From 5896e6e39b86c1d820b3ccf5caea9aef40c2eacd Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 16 Feb 2024 15:40:26 +0100 Subject: iommu: constify fwnode in iommu_ops_from_fwnode() Make pointer to fwnode_handle a pointer to const for code safety. 
Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240216144027.185959-3-krzysztof.kozlowski@linaro.org Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 98a958621dcb..7c8032202457 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -986,7 +986,7 @@ int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, const struct iommu_ops *ops); void iommu_fwspec_free(struct device *dev); int iommu_fwspec_add_ids(struct device *dev, const u32 *ids, int num_ids); -const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode); +const struct iommu_ops *iommu_ops_from_fwnode(const struct fwnode_handle *fwnode); static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) { @@ -1309,7 +1309,7 @@ static inline int iommu_fwspec_add_ids(struct device *dev, u32 *ids, } static inline -const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode) +const struct iommu_ops *iommu_ops_from_fwnode(const struct fwnode_handle *fwnode) { return NULL; } -- cgit v1.2.3 From f094323867668d50124886ad884b665de7319537 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 26 Jan 2024 10:39:42 -0500 Subject: sunrpc: pass in the sv_stats struct through svc_create_pooled Since only one service actually reports the rpc stats there's not much of a reason to have a pointer to it in the svc_program struct. Adjust the svc_create_pooled function to take the sv_stats as an argument and pass the struct through there as desired instead of getting it from the svc_program->pg_stats. 
Signed-off-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 67cf1c9efd80..91a653eb3a50 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -411,7 +411,9 @@ bool svc_rqst_replace_page(struct svc_rqst *rqstp, void svc_rqst_release_pages(struct svc_rqst *rqstp); void svc_rqst_free(struct svc_rqst *); void svc_exit_thread(struct svc_rqst *); -struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, +struct svc_serv * svc_create_pooled(struct svc_program *prog, + struct svc_stat *stats, + unsigned int bufsize, int (*threadfn)(void *data)); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); int svc_pool_stats_open(struct svc_info *si, struct file *file); -- cgit v1.2.3 From 3f6ef182f144dcc9a4d942f97b6a8ed969f13c95 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 26 Jan 2024 10:39:43 -0500 Subject: sunrpc: remove ->pg_stats from svc_program Now that this isn't used anywhere, remove it. 
Signed-off-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 91a653eb3a50..23617da0e565 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -339,7 +339,6 @@ struct svc_program { const struct svc_version **pg_vers; /* version array */ char * pg_name; /* service name */ char * pg_class; /* class name: services sharing authentication */ - struct svc_stat * pg_stats; /* rpc statistics */ enum svc_auth_status (*pg_authenticate)(struct svc_rqst *rqstp); __be32 (*pg_init_request)(struct svc_rqst *, const struct svc_program *, -- cgit v1.2.3 From 71b43531ee0be6dcaa406132ebd540022dcb12ea Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 4 Feb 2024 18:17:22 -0500 Subject: svcrdma: Post Send WR chain Eventually I'd like the server to post the reply's Send WR along with any Write WRs using only a single call to ib_post_send(), in order to reduce the NIC's doorbell rate. To do this, add an anchor for a WR chain to svc_rdma_send_ctxt, and refactor svc_rdma_send() to post this WR chain to the Send Queue. For the moment, the posted chain will continue to contain a single Send WR. 
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index e7595ae62fe2..ee05087d6499 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -210,6 +210,8 @@ struct svc_rdma_send_ctxt { struct svcxprt_rdma *sc_rdma; struct ib_send_wr sc_send_wr; + struct ib_send_wr *sc_wr_chain; + int sc_sqecount; struct ib_cqe sc_cqe; struct xdr_buf sc_hdrbuf; struct xdr_stream sc_stream; @@ -258,8 +260,8 @@ extern struct svc_rdma_send_ctxt * svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma); extern void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt); -extern int svc_rdma_send(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt); +extern int svc_rdma_post_send(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt); extern int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, const struct svc_rdma_pcl *write_pcl, -- cgit v1.2.3 From a1f5788a0c250c87d3007d59d11a00ab98e66f01 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 4 Feb 2024 18:17:28 -0500 Subject: svcrdma: Move write_info for Reply chunks into struct svc_rdma_send_ctxt Since the RPC transaction's svc_rdma_send_ctxt will stay around for the duration of the RDMA Write operation, the write_info structure for the Reply chunk can reside in the request's svc_rdma_send_ctxt instead of being allocated separately. 
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index ee05087d6499..918cf4fda728 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -203,6 +203,29 @@ struct svc_rdma_recv_ctxt { struct page *rc_pages[RPCSVC_MAXPAGES]; }; +/* + * State for sending a Write chunk. + * - Tracks progress of writing one chunk over all its segments + * - Stores arguments for the SGL constructor functions + */ +struct svc_rdma_write_info { + struct svcxprt_rdma *wi_rdma; + + const struct svc_rdma_chunk *wi_chunk; + + /* write state of this chunk */ + unsigned int wi_seg_off; + unsigned int wi_seg_no; + + /* SGL constructor arguments */ + const struct xdr_buf *wi_xdr; + unsigned char *wi_base; + unsigned int wi_next_off; + + struct svc_rdma_chunk_ctxt wi_cc; + struct work_struct wi_work; +}; + struct svc_rdma_send_ctxt { struct llist_node sc_node; struct rpc_rdma_cid sc_cid; @@ -215,6 +238,7 @@ struct svc_rdma_send_ctxt { struct ib_cqe sc_cqe; struct xdr_buf sc_hdrbuf; struct xdr_stream sc_stream; + struct svc_rdma_write_info sc_reply_info; void *sc_xprt_buf; int sc_page_count; int sc_cur_sge_no; @@ -249,6 +273,7 @@ extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, const struct xdr_buf *xdr); extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, const struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_send_ctxt *sctxt, const struct xdr_buf *xdr); extern int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, -- cgit v1.2.3 From 10e6fc1054d900a205e5233f186ebce9c50e1d1d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 4 Feb 2024 18:17:34 -0500 Subject: svcrdma: Post the Reply chunk and Send WR together Reduce the doorbell and Send completion rates when sending RPC/RDMA replies that have Reply chunks. 
NFS READDIR procedures typically return their result in a Reply chunk, for example. Instead of calling ib_post_send() to post the Write WRs for the Reply chunk, and then calling it again to post the Send WR that conveys the transport header, chain the Write WRs to the Send WR and call ib_post_send() only once. Thanks to the Send Queue completion ordering rules, when the Send WR completes, that guarantees that Write WRs posted before it have also completed successfully. Thus all Write WRs for the Reply chunk can remain unsignaled. Instead of handling a Write completion and then a Send completion, only the Send completion is seen, and it handles clean up for both the Writes and the Send. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 918cf4fda728..ac882bd23ca2 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -262,19 +262,24 @@ extern void svc_rdma_release_ctxt(struct svc_xprt *xprt, void *ctxt); extern int svc_rdma_recvfrom(struct svc_rqst *); /* svc_rdma_rw.c */ +extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma, + struct svc_rdma_chunk_ctxt *cc); extern void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma); extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma, struct svc_rdma_chunk_ctxt *cc); extern void svc_rdma_cc_release(struct svcxprt_rdma *rdma, struct svc_rdma_chunk_ctxt *cc, enum dma_data_direction dir); +extern void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt); extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, const struct svc_rdma_chunk *chunk, const struct xdr_buf *xdr); -extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, - const struct svc_rdma_recv_ctxt *rctxt, - struct svc_rdma_send_ctxt *sctxt, - const struct xdr_buf *xdr); +extern int 
svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, + const struct svc_rdma_pcl *write_pcl, + const struct svc_rdma_pcl *reply_pcl, + struct svc_rdma_send_ctxt *sctxt, + const struct xdr_buf *xdr); extern int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, struct svc_rdma_recv_ctxt *head); -- cgit v1.2.3 From d2727cefff0204c3074a10e3ad2e1b5c9dfb986c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 4 Feb 2024 18:17:41 -0500 Subject: svcrdma: Post WRs for Write chunks in svc_rdma_sendto() Refactor to eventually enable svcrdma to post the Write WRs for each RPC response using the same ib_post_send() as the Send WR (ie, as a single WR chain). svc_rdma_result_payload (originally svc_rdma_read_payload) was added so that the upper layer XDR encoder could identify a range of bytes to be possibly conveyed by RDMA (if a Write chunk was provided by the client). The purpose of commit f6ad77590a5d ("svcrdma: Post RDMA Writes while XDR encoding replies") was to post as much of the result payload outside of svc_rdma_sendto() as possible because svc_rdma_sendto() used to be called with the xpt_mutex held. However, since commit ca4faf543a33 ("SUNRPC: Move xpt_mutex into socket xpo_sendto methods"), the xpt_mutex is no longer held when calling svc_rdma_sendto(). Thus, that benefit is no longer an issue. 
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index ac882bd23ca2..d33bab33099a 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -272,9 +272,9 @@ extern void svc_rdma_cc_release(struct svcxprt_rdma *rdma, enum dma_data_direction dir); extern void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt); -extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, - const struct svc_rdma_chunk *chunk, - const struct xdr_buf *xdr); +extern int svc_rdma_send_write_list(struct svcxprt_rdma *rdma, + const struct svc_rdma_recv_ctxt *rctxt, + const struct xdr_buf *xdr); extern int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, const struct svc_rdma_pcl *write_pcl, const struct svc_rdma_pcl *reply_pcl, -- cgit v1.2.3 From e084ee673c77cade06ab4c2e36b5624c82608b8c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 4 Feb 2024 18:17:47 -0500 Subject: svcrdma: Add Write chunk WRs to the RPC's Send WR chain Chain RDMA Writes that convey Write chunks onto the local Send chain. This means all WRs for an RPC Reply are now posted with a single ib_post_send() call, and there is a single Send completion when all of these are done. That reduces both the per-transport doorbell rate and completion rate. 
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index d33bab33099a..24cd199dd6f3 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -210,6 +210,7 @@ struct svc_rdma_recv_ctxt { */ struct svc_rdma_write_info { struct svcxprt_rdma *wi_rdma; + struct list_head wi_list; const struct svc_rdma_chunk *wi_chunk; @@ -238,7 +239,10 @@ struct svc_rdma_send_ctxt { struct ib_cqe sc_cqe; struct xdr_buf sc_hdrbuf; struct xdr_stream sc_stream; + + struct list_head sc_write_info_list; struct svc_rdma_write_info sc_reply_info; + void *sc_xprt_buf; int sc_page_count; int sc_cur_sge_no; @@ -270,11 +274,14 @@ extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma, extern void svc_rdma_cc_release(struct svcxprt_rdma *rdma, struct svc_rdma_chunk_ctxt *cc, enum dma_data_direction dir); +extern void svc_rdma_write_chunk_release(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt); extern void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt); -extern int svc_rdma_send_write_list(struct svcxprt_rdma *rdma, - const struct svc_rdma_recv_ctxt *rctxt, - const struct xdr_buf *xdr); +extern int svc_rdma_prepare_write_list(struct svcxprt_rdma *rdma, + const struct svc_rdma_pcl *write_pcl, + struct svc_rdma_send_ctxt *sctxt, + const struct xdr_buf *xdr); extern int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, const struct svc_rdma_pcl *write_pcl, const struct svc_rdma_pcl *reply_pcl, -- cgit v1.2.3 From 631d4efb8009df64deae8c1382b8cf43879a4e22 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Feb 2024 14:56:40 -0800 Subject: block: add a queue_limits_set helper Add a small wrapper around queue_limits_commit_update for stacking drivers that don't want to update existing limits, but set an entirely new set. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240228225653.947152-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a14ea9344138..dd510ad7ce4b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -889,6 +889,7 @@ queue_limits_start_update(struct request_queue *q) } int queue_limits_commit_update(struct request_queue *q, struct queue_limits *lim); +int queue_limits_set(struct request_queue *q, struct queue_limits *lim); /* * Access functions for manipulating queue properties -- cgit v1.2.3 From c1373f1cf452e4c7553a9d3bc05d87ec15c4f85f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Feb 2024 14:56:41 -0800 Subject: block: add a queue_limits_stack_bdev helper Add a small wrapper around blk_stack_limits that allows passing a bdev for the bottom device and prints an error in case of misaligned device. The name fits into the new queue limits API and the intent is to eventually replace disk_stack_limits. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240228225653.947152-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index dd510ad7ce4b..285e82723d64 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -924,6 +924,8 @@ extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); extern void blk_set_stacking_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset); +void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, + sector_t offset, const char *pfx); extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, sector_t offset); extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); -- cgit v1.2.3 From 2be2a197ff6c3a659ab9285e1d88cbdc609ac6de Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Feb 2024 15:23:36 +0100 Subject: sched/idle: Conditionally handle tick broadcast in default_idle_call() The x86 architecture has an idle routine for AMD CPUs which are affected by erratum 400. On the affected CPUs the local APIC timer stops in the C1E halt state. It therefore requires tick broadcasting. The invocation of tick_broadcast_enter()/exit() from this function violates the RCU constraints because it can end up in lockdep or tracing, which rightfully triggers a warning. tick_broadcast_enter()/exit() must be invoked before ct_cpuidle_enter() and after ct_cpuidle_exit() in default_idle_call(). Add a static branch conditional invocation of tick_broadcast_enter()/exit() into this function to allow X86 to replace the AMD specific idle code. It's guarded by a config switch which will be selected by x86. Otherwise it's a NOOP. 
Reported-by: Borislav Petkov Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240229142248.266708822@linutronix.de --- include/linux/cpu.h | 2 ++ include/linux/tick.h | 3 +++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index dcb89c987164..715017d4432b 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -196,6 +196,8 @@ void arch_cpu_idle(void); void arch_cpu_idle_prepare(void); void arch_cpu_idle_enter(void); void arch_cpu_idle_exit(void); +void arch_tick_broadcast_enter(void); +void arch_tick_broadcast_exit(void); void __noreturn arch_cpu_idle_dead(void); #ifdef CONFIG_ARCH_HAS_CPU_FINALIZE_INIT diff --git a/include/linux/tick.h b/include/linux/tick.h index 716d17f31c45..e134f286df8a 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef CONFIG_GENERIC_CLOCKEVENTS extern void __init tick_init(void); @@ -63,6 +64,8 @@ enum tick_broadcast_state { TICK_BROADCAST_ENTER, }; +extern struct static_key_false arch_needs_tick_broadcast; + #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST extern void tick_broadcast_control(enum tick_broadcast_mode mode); #else -- cgit v1.2.3 From 71c2cc5cbf686c2397f43cbcb51a31589bdcee7b Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Fri, 1 Mar 2024 14:46:15 -0300 Subject: power: supply: core: make power_supply_class constant Since commit 43a7206b0963 ("driver core: class: make class_register() take a const *"), the driver core allows for struct class to be in read-only memory, so move the power_supply_class structure to be declared at build time placing it into read-only memory, instead of having to be dynamically allocated at boot time. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B. 
Marliere Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20240301-class_cleanup-power-v1-1-97e0b7bf9c94@marliere.net Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index c0992a77feea..514f652de64d 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -895,7 +895,7 @@ extern int power_supply_powers(struct power_supply *psy, struct device *dev); extern void *power_supply_get_drvdata(struct power_supply *psy); /* For APM emulation, think legacy userspace. */ -extern struct class *power_supply_class; +extern const struct class power_supply_class; static inline bool power_supply_is_amp_property(enum power_supply_property psp) { -- cgit v1.2.3 From bd1ebf2467f9c5d157bec7b025e83f8ffdae1318 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 29 Feb 2024 22:22:26 -0800 Subject: overflow: Allow non-type arg to type_max() and type_min() A common use of type_max() is to find the max for the type of a variable. Using the pattern type_max(typeof(var)) is needlessly verbose. Instead, since typeof(type) == type we can just explicitly call typeof() on the argument to type_max() and type_min(). Add wrappers for readability. We can do some replacements right away: $ git grep '\btype_\(min\|max\)(typeof' | wc -l 11 Link: https://lore.kernel.org/r/20240301062221.work.840-kees@kernel.org Signed-off-by: Kees Cook --- include/linux/overflow.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/overflow.h b/include/linux/overflow.h index bc390f026128..aa691f2119b0 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -31,8 +31,10 @@ * credit to Christian Biere. 
*/ #define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type))) -#define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T))) -#define type_min(T) ((T)((T)-type_max(T)-(T)1)) +#define __type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T))) +#define type_max(t) __type_max(typeof(t)) +#define __type_min(T) ((T)((T)-type_max(T)-(T)1)) +#define type_min(t) __type_min(typeof(t)) /* * Avoids triggering -Wtype-limits compilation warning, @@ -207,10 +209,10 @@ static inline bool __must_check __must_check_overflow(bool overflow) #define __overflows_type_constexpr(x, T) ( \ is_unsigned_type(typeof(x)) ? \ - (x) > type_max(typeof(T)) : \ + (x) > type_max(T) : \ is_unsigned_type(typeof(T)) ? \ - (x) < 0 || (x) > type_max(typeof(T)) : \ - (x) < type_min(typeof(T)) || (x) > type_max(typeof(T))) + (x) < 0 || (x) > type_max(T) : \ + (x) < type_min(T) || (x) > type_max(T)) #define __overflows_type(x, T) ({ \ typeof(T) v = 0; \ -- cgit v1.2.3 From c3b9a398fb0dae67f91e7ae4bb492e04ac2c80c0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 29 Feb 2024 20:44:37 -0800 Subject: compiler.h: Explain how __is_constexpr() works The __is_constexpr() macro is dark magic. Shed some light on it with a comment to explain how and why it works. Acked-by: Gustavo A. R. Silva Reviewed-by: Jani Nikula Link: https://lore.kernel.org/r/20240301044428.work.411-kees@kernel.org Signed-off-by: Kees Cook --- include/linux/compiler.h | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index bb1339c7057b..b688ad992127 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -231,6 +231,45 @@ static inline void *offset_to_ptr(const int *off) * This returns a constant expression while determining if an argument is * a constant expression, most importantly without evaluating the argument. 
* Glory to Martin Uecker + * + * Details: + * - sizeof() return an integer constant expression, and does not evaluate + * the value of its operand; it only examines the type of its operand. + * - The results of comparing two integer constant expressions is also + * an integer constant expression. + * - The first literal "8" isn't important. It could be any literal value. + * - The second literal "8" is to avoid warnings about unaligned pointers; + * this could otherwise just be "1". + * - (long)(x) is used to avoid warnings about 64-bit types on 32-bit + * architectures. + * - The C Standard defines "null pointer constant", "(void *)0", as + * distinct from other void pointers. + * - If (x) is an integer constant expression, then the "* 0l" resolves + * it into an integer constant expression of value 0. Since it is cast to + * "void *", this makes the second operand a null pointer constant. + * - If (x) is not an integer constant expression, then the second operand + * resolves to a void pointer (but not a null pointer constant: the value + * is not an integer constant 0). + * - The conditional operator's third operand, "(int *)8", is an object + * pointer (to type "int"). + * - The behavior (including the return type) of the conditional operator + * ("operand1 ? operand2 : operand3") depends on the kind of expressions + * given for the second and third operands. This is the central mechanism + * of the macro: + * - When one operand is a null pointer constant (i.e. when x is an integer + * constant expression) and the other is an object pointer (i.e. our + * third operand), the conditional operator returns the type of the + * object pointer operand (i.e. "int *). Here, within the sizeof(), we + * would then get: + * sizeof(*((int *)(...)) == sizeof(int) == 4 + * - When one operand is a void pointer (i.e. when x is not an integer + * constant expression) and the other is an object pointer (i.e. 
our + * third operand), the conditional operator returns a "void *" type. + * Here, within the sizeof(), we would then get: + * sizeof(*((void *)(...)) == sizeof(void) == 1 + * - The equality comparison to "sizeof(int)" therefore depends on (x): + * sizeof(int) == sizeof(int) (x) was a constant expression + * sizeof(int) != sizeof(void) (x) was not a constant expression */ #define __is_constexpr(x) \ (sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8))) -- cgit v1.2.3 From 5e6107b499f3fc4748109e1d87fd9603b34f1e0d Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sun, 28 Jan 2024 20:43:58 +0200 Subject: net/mlx5: Check capability for fw_reset Functions which can't access MFRL (Management Firmware Reset Level) register, have no use of fw_reset structures or events. Remove fw_reset structures allocation and registration for fw reset events notifications for these functions. Having the devlink param enable_remote_dev_reset on functions that don't have this capability is misleading as these functions are not allowed to influence the reset flow. Hence, this patch removes this parameter for such functions. In addition, return not supported on devlink reload action fw_activate for these functions. 
Fixes: 38b9f903f22b ("net/mlx5: Handle sync reset request event") Signed-off-by: Moshe Shemesh Reviewed-by: Aya Levin Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3fd6310b6da6..486b7492050c 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10261,7 +10261,9 @@ struct mlx5_ifc_mcam_access_reg_bits { u8 regs_63_to_46[0x12]; u8 mrtc[0x1]; - u8 regs_44_to_32[0xd]; + u8 regs_44_to_41[0x4]; + u8 mfrl[0x1]; + u8 regs_39_to_32[0x8]; u8 regs_31_to_10[0x16]; u8 mtmp[0x1]; -- cgit v1.2.3 From 7838b4656110d950afdd92a081cc0f33e23e0ea8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 25 Feb 2024 11:01:41 +0800 Subject: block: define bvec_iter as __packed __aligned(4) In commit 19416123ab3e ("block: define 'struct bvec_iter' as packed"), what we need is to save the 4byte padding, and avoid `bio` to spread on one extra cache line. It is enough to define it as '__packed __aligned(4)', as '__packed' alone means byte aligned, and can cause compiler to generate horrible code on architectures that don't support unaligned access in case that bvec_iter is embedded in other structures. 
Cc: Mikulas Patocka Suggested-by: Linus Torvalds Fixes: 19416123ab3e ("block: define 'struct bvec_iter' as packed") Signed-off-by: Ming Lei Signed-off-by: Linus Torvalds --- include/linux/bvec.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 555aae5448ae..bd1e361b351c 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -83,7 +83,7 @@ struct bvec_iter { unsigned int bi_bvec_done; /* number of bytes completed in current bvec */ -} __packed; +} __packed __aligned(4); struct bvec_iter_all { struct bio_vec bv; -- cgit v1.2.3 From a74c0c9c3f7fa6fba34196d142bab93509f17dba Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Thu, 29 Feb 2024 14:23:57 +0100 Subject: USB: typec: no opencoding FIELD_GET We have a macro. It should be used. Signed-off-by: Oliver Neukum Reviewed-by: Mika Westerberg Link: https://lore.kernel.org/r/20240229132401.3270-1-oneukum@suse.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/typec_tbt.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/typec_tbt.h b/include/linux/usb/typec_tbt.h index c7a2153bd6f5..fa97d7e00f5c 100644 --- a/include/linux/usb/typec_tbt.h +++ b/include/linux/usb/typec_tbt.h @@ -3,6 +3,7 @@ #define __USB_TYPEC_TBT_H #include +#include #define USB_TYPEC_VENDOR_INTEL 0x8087 /* Alias for convenience */ @@ -25,7 +26,7 @@ struct typec_thunderbolt_data { /* TBT3 Device Discover Mode VDO bits */ #define TBT_MODE BIT(0) -#define TBT_ADAPTER(_vdo_) (((_vdo_) & BIT(16)) >> 16) +#define TBT_ADAPTER(_vdo_) FIELD_GET(BIT(16), _vdo_) #define TBT_ADAPTER_LEGACY 0 #define TBT_ADAPTER_TBT3 1 #define TBT_INTEL_SPECIFIC_B0 BIT(26) @@ -35,12 +36,12 @@ struct typec_thunderbolt_data { #define TBT_SET_ADAPTER(a) (((a) & 1) << 16) /* TBT3 Cable Discover Mode VDO bits */ -#define TBT_CABLE_SPEED(_vdo_) (((_vdo_) & GENMASK(18, 16)) >> 16) +#define 
TBT_CABLE_SPEED(_vdo_) FIELD_GET(GENMASK(18, 16), _vdo_) #define TBT_CABLE_USB3_GEN1 1 #define TBT_CABLE_USB3_PASSIVE 2 #define TBT_CABLE_10_AND_20GBPS 3 -#define TBT_CABLE_ROUNDED_SUPPORT(_vdo_) \ - (((_vdo_) & GENMASK(20, 19)) >> 19) +#define TBT_CABLE_ROUNDED_SUPPORT(_vdo_) FIELD_GET(GENMASK(20, 19), _vdo_) + #define TBT_GEN3_NON_ROUNDED 0 #define TBT_GEN3_GEN4_ROUNDED_NON_ROUNDED 1 #define TBT_CABLE_OPTICAL BIT(21) -- cgit v1.2.3 From 25802f3ab7a589d2b5d9e1266acffad263d9893e Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 23 Jan 2024 16:40:25 +0200 Subject: nvme-rdma: move NVME_RDMA_IP_PORT from common file The correct place for this definition is the nvme rdma header file and not the common nvme header file. Reviewed-by: Christoph Hellwig Reviewed-by: Israel Rukshin Reviewed-by: Sagi Grimberg Signed-off-by: Max Gurtovoy Signed-off-by: Keith Busch --- include/linux/nvme-rdma.h | 2 ++ include/linux/nvme.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h index 4dd7e6fe92fb..146dd2223a5f 100644 --- a/include/linux/nvme-rdma.h +++ b/include/linux/nvme-rdma.h @@ -6,6 +6,8 @@ #ifndef _LINUX_NVME_RDMA_H #define _LINUX_NVME_RDMA_H +#define NVME_RDMA_IP_PORT 4420 + #define NVME_RDMA_MAX_QUEUE_SIZE 128 enum nvme_rdma_cm_fmt { diff --git a/include/linux/nvme.h b/include/linux/nvme.h index bc605ec4a3fd..ce0c1143c7e4 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -23,8 +23,6 @@ #define NVME_DISC_SUBSYS_NAME "nqn.2014-08.org.nvmexpress.discovery" -#define NVME_RDMA_IP_PORT 4420 - #define NVME_NSID_ALL 0xffffffff enum nvme_subsys_type { -- cgit v1.2.3 From 36144964062b8676ee64281852de2a2c1b193aca Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 23 Jan 2024 16:40:29 +0200 Subject: nvme-rdma: introduce NVME_RDMA_MAX_METADATA_QUEUE_SIZE definition This definition will be used by controllers that are configured with metadata support. 
For now, both regular and metadata controllers have the same maximal queue size but a later commit will increase the maximal queue size for regular RDMA controllers to 256. We'll keep the maximal queue size for metadata controllers to be 128 since there are more resources that are needed for metadata operations and 128 is the optimal size found for metadata controllers based on testing. Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Israel Rukshin Signed-off-by: Max Gurtovoy Signed-off-by: Keith Busch --- include/linux/nvme-rdma.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h index 146dd2223a5f..d0b9941911a1 100644 --- a/include/linux/nvme-rdma.h +++ b/include/linux/nvme-rdma.h @@ -8,7 +8,8 @@ #define NVME_RDMA_IP_PORT 4420 -#define NVME_RDMA_MAX_QUEUE_SIZE 128 +#define NVME_RDMA_MAX_QUEUE_SIZE 128 +#define NVME_RDMA_MAX_METADATA_QUEUE_SIZE 128 enum nvme_rdma_cm_fmt { NVME_RDMA_CM_FMT_1_0 = 0x0, -- cgit v1.2.3 From f096ba3286f5e773c496cf81667d01f2e8a2a37b Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 23 Jan 2024 16:40:32 +0200 Subject: nvmet-rdma: set max_queue_size for RDMA transport A new port configuration was added to set max_queue_size. Clamp user configuration to RDMA transport limits. Increase the maximal queue size of RDMA controllers from 128 to 256 (the default size stays 128 same as before).
Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Israel Rukshin Signed-off-by: Max Gurtovoy Signed-off-by: Keith Busch --- include/linux/nvme-rdma.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h index d0b9941911a1..eb2f04d636c8 100644 --- a/include/linux/nvme-rdma.h +++ b/include/linux/nvme-rdma.h @@ -8,8 +8,9 @@ #define NVME_RDMA_IP_PORT 4420 -#define NVME_RDMA_MAX_QUEUE_SIZE 128 +#define NVME_RDMA_MAX_QUEUE_SIZE 256 #define NVME_RDMA_MAX_METADATA_QUEUE_SIZE 128 +#define NVME_RDMA_DEFAULT_QUEUE_SIZE 128 enum nvme_rdma_cm_fmt { NVME_RDMA_CM_FMT_1_0 = 0x0, -- cgit v1.2.3 From caddc92c57451d983c7e31e60b961c5aae4ece63 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Sat, 2 Mar 2024 19:33:29 +0200 Subject: gpio: nomadik: Finish conversion to use firmware node APIs Previously the driver got a few updates in order to replace OF APIs by the respective firmware node ones; however, the conversion was not finished to its logical end, e.g., some APIs that are used still require an OF node to be passed. Finish that job by converting leftovers to use firmware node APIs.
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240302173401.217830-1-andy.shevchenko@gmail.com Signed-off-by: Linus Walleij --- include/linux/gpio/gpio-nomadik.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gpio/gpio-nomadik.h b/include/linux/gpio/gpio-nomadik.h index 9bdb09fda4c9..4a95ea7935fb 100644 --- a/include/linux/gpio/gpio-nomadik.h +++ b/include/linux/gpio/gpio-nomadik.h @@ -2,6 +2,8 @@ #ifndef __LINUX_GPIO_NOMADIK_H #define __LINUX_GPIO_NOMADIK_H +struct fwnode_handle; + /* Package definitions */ #define PINCTRL_NMK_STN8815 0 #define PINCTRL_NMK_DB8500 1 @@ -263,7 +265,7 @@ void __nmk_gpio_make_output(struct nmk_gpio_chip *nmk_chip, unsigned int offset, int val); void __nmk_gpio_set_slpm(struct nmk_gpio_chip *nmk_chip, unsigned int offset, enum nmk_gpio_slpm mode); -struct nmk_gpio_chip *nmk_gpio_populate_chip(struct device_node *np, +struct nmk_gpio_chip *nmk_gpio_populate_chip(struct fwnode_handle *fwnode, struct platform_device *pdev); /* Symbols declared in pinctrl-nomadik used by gpio-nomadik. */ -- cgit v1.2.3 From c029b22f8a98e14988f800d5c0176a9eaec3c8db Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 14 Dec 2023 21:31:48 +1100 Subject: of: Add of_machine_compatible_match() We have of_machine_is_compatible() to check if a machine is compatible with a single compatible string. However some code is able to support multiple compatible boards, and so wants to check for one of many compatible strings. So add of_machine_compatible_match() which takes a NULL terminated array of compatible strings to check against the root node's compatible property. Compared to an open coded match this is slightly more self documenting, and also avoids the caller needing to juggle the root node either directly or via of_find_node_by_path(). 
Signed-off-by: Christophe Leroy Reviewed-by: Rob Herring Signed-off-by: Michael Ellerman Link: https://msgid.link/20231214103152.12269-1-mpe@ellerman.id.au --- include/linux/of.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 6a9ddf20e79a..e3418babc203 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -403,6 +403,7 @@ extern int of_alias_get_id(struct device_node *np, const char *stem); extern int of_alias_get_highest_id(const char *stem); extern int of_machine_is_compatible(const char *compat); +bool of_machine_compatible_match(const char *const *compats); extern int of_add_property(struct device_node *np, struct property *prop); extern int of_remove_property(struct device_node *np, struct property *prop); @@ -808,6 +809,11 @@ static inline int of_remove_property(struct device_node *np, struct property *pr return 0; } +static inline bool of_machine_compatible_match(const char *const *compats) +{ + return false; +} + static inline bool of_console_check(const struct device_node *dn, const char *name, int index) { return false; -- cgit v1.2.3 From cefdb366dcbe97908b6055595a15bf7689556bf8 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 14 Dec 2023 21:31:49 +1100 Subject: of: Change of_machine_is_compatible() to return bool of_machine_is_compatible() currently returns a positive integer if it finds a match. However none of the callers ever check the value, they all treat it as a true/false. So change of_machine_is_compatible() to return bool, which will allow the implementation to be changed in a subsequent patch. 
Suggested-by: Rob Herring Signed-off-by: Michael Ellerman Link: https://msgid.link/20231214103152.12269-2-mpe@ellerman.id.au --- include/linux/of.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index e3418babc203..c5c663a7fb77 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -402,7 +402,7 @@ extern void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align)); extern int of_alias_get_id(struct device_node *np, const char *stem); extern int of_alias_get_highest_id(const char *stem); -extern int of_machine_is_compatible(const char *compat); +extern bool of_machine_is_compatible(const char *compat); bool of_machine_compatible_match(const char *const *compats); extern int of_add_property(struct device_node *np, struct property *prop); -- cgit v1.2.3 From 1ac8205f907517a306b661212496fedce79d7cc5 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 14 Dec 2023 21:31:50 +1100 Subject: of: Reimplement of_machine_is_compatible() using of_machine_compatible_match() of_machine_compatible_match() works with a table of strings. of_machine_is_compatible() is a simpler version with only one string. Re-implement of_machine_is_compatible() by setting a table of strings with a single string then using of_machine_compatible_match().
Suggested-by: Rob Herring Signed-off-by: Christophe Leroy Reviewed-by: Rob Herring Signed-off-by: Michael Ellerman Link: https://msgid.link/20231214103152.12269-3-mpe@ellerman.id.au --- include/linux/of.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index c5c663a7fb77..03ed4e37ca57 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -402,9 +402,21 @@ extern void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align)); extern int of_alias_get_id(struct device_node *np, const char *stem); extern int of_alias_get_highest_id(const char *stem); -extern bool of_machine_is_compatible(const char *compat); bool of_machine_compatible_match(const char *const *compats); +/** + * of_machine_is_compatible - Test root of device tree for a given compatible value + * @compat: compatible string to look for in root node's compatible property. + * + * Return: true if the root node has the given value in its compatible property. + */ +static inline bool of_machine_is_compatible(const char *compat) +{ + const char *compats[] = { compat, NULL }; + + return of_machine_compatible_match(compats); +} + extern int of_add_property(struct device_node *np, struct property *prop); extern int of_remove_property(struct device_node *np, struct property *prop); extern int of_update_property(struct device_node *np, struct property *newprop); -- cgit v1.2.3 From 99fea943d9dc2500227bced9acd671e5b39a1471 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Mon, 19 Feb 2024 09:14:36 -0300 Subject: soundwire: constify the struct device_type usage Since commit aed65af1cc2f ("drivers: make device_type const"), the driver core can properly handle constant struct device_type. Move the sdw_master_type and sdw_slave_type variables to be constant structures as well, placing it into read-only memory which can not be modified at runtime. Cc: Greg Kroah-Hartman Signed-off-by: "Ricardo B. 
Marliere" Link: https://lore.kernel.org/r/20240219-device_cleanup-soundwire-v1-1-9edd51767611@marliere.net Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_type.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_type.h b/include/linux/soundwire/sdw_type.h index b445f7200f06..693320b4f5c2 100644 --- a/include/linux/soundwire/sdw_type.h +++ b/include/linux/soundwire/sdw_type.h @@ -5,8 +5,8 @@ #define __SOUNDWIRE_TYPES_H extern const struct bus_type sdw_bus_type; -extern struct device_type sdw_slave_type; -extern struct device_type sdw_master_type; +extern const struct device_type sdw_slave_type; +extern const struct device_type sdw_master_type; static inline int is_sdw_slave(const struct device *dev) { -- cgit v1.2.3 From fbd5f5008fab2203fa21e82579b9b48a7256b8fd Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Sun, 3 Mar 2024 15:05:42 -0800 Subject: Input: serio - make serio_bus const Now that the driver core can properly handle constant struct bus_type, move the serio_bus variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B. 
Marliere Link: https://lore.kernel.org/r/20240210-bus_cleanup-input2-v1-2-0daef7e034e0@marliere.net Signed-off-by: Dmitry Torokhov --- include/linux/serio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/serio.h b/include/linux/serio.h index 6c27d413da92..7ca41af93b37 100644 --- a/include/linux/serio.h +++ b/include/linux/serio.h @@ -15,7 +15,7 @@ #include #include -extern struct bus_type serio_bus; +extern const struct bus_type serio_bus; struct serio { void *port_data; -- cgit v1.2.3 From cc15bd10e716fcb472d611f24d76c795acb0f8c7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 29 Feb 2024 09:39:08 +0000 Subject: net: adopt skb_network_header_len() more broadly (skb_transport_header(skb) - skb_network_header(skb)) can be replaced by skb_network_header_len(skb) Add a DEBUG_NET_WARN_ON_ONCE() in skb_network_header_len() to catch cases where the transport_header was not set. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1470b74fb6d2..d577e0bee18d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3036,6 +3036,7 @@ static inline int skb_transport_offset(const struct sk_buff *skb) static inline u32 skb_network_header_len(const struct sk_buff *skb) { + DEBUG_NET_WARN_ON_ONCE(!skb_transport_header_was_set(skb)); return skb->transport_header - skb->network_header; } -- cgit v1.2.3 From 712610725c48c829e42bebfc9908cd92468e2731 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Mar 2024 11:12:22 +0100 Subject: smp: Consolidate smp_prepare_boot_cpu() There is no point in having seven architectures implementing the same empty stub. Provide a weak function in the init code and remove the stubs. This also allows to utilize the function on UP which is required to sanitize the per CPU handling on X86 UP.
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240304005104.567671691@linutronix.de --- include/linux/smp.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index e87520dc2959..b84592950149 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -105,6 +105,12 @@ static inline void on_each_cpu_cond(smp_cond_func_t cond_func, on_each_cpu_cond_mask(cond_func, func, info, wait, cpu_online_mask); } +/* + * Architecture specific boot CPU setup. Defined as empty weak function in + * init/main.c. Architectures can override it. + */ +void smp_prepare_boot_cpu(void); + #ifdef CONFIG_SMP #include @@ -171,12 +177,6 @@ void generic_smp_call_function_single_interrupt(void); #define generic_smp_call_function_interrupt \ generic_smp_call_function_single_interrupt -/* - * Mark the boot cpu "online" so that it can call console drivers in - * printk() and can access its per-cpu storage. - */ -void smp_prepare_boot_cpu(void); - extern unsigned int setup_max_cpus; extern void __init setup_nr_cpu_ids(void); extern void __init smp_init(void); @@ -203,7 +203,6 @@ static inline void up_smp_call_function(smp_call_func_t func, void *info) (up_smp_call_function(func, info)) static inline void smp_send_reschedule(int cpu) { } -#define smp_prepare_boot_cpu() do {} while (0) #define smp_call_function_many(mask, func, info, wait) \ (up_smp_call_function(func, info)) static inline void call_function_init(void) { } -- cgit v1.2.3 From eb52034436a58e742ceea0dcf2f003f83a3449a5 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Mon, 19 Feb 2024 09:38:47 -0300 Subject: i2c: constify the struct device_type usage Since commit aed65af1cc2f ("drivers: make device_type const"), the driver core can properly handle constant struct device_type. 
Move the i2c_adapter_type and i2c_client_type variables to be constant structures as well, placing it into read-only memory which can not be modified at runtime. Signed-off-by: Ricardo B. Marliere Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 652ecb7abeda..ff93ff8b257c 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -24,8 +24,8 @@ #include extern const struct bus_type i2c_bus_type; -extern struct device_type i2c_adapter_type; -extern struct device_type i2c_client_type; +extern const struct device_type i2c_adapter_type; +extern const struct device_type i2c_client_type; /* --- General options ------------------------------------------------ */ -- cgit v1.2.3 From 469f6acd9a538ea963e2d4d13ba721a7ad1c1813 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Tue, 13 Feb 2024 11:46:25 -0300 Subject: tee: make tee_bus_type const Since commit d492cc2573a0 ("driver core: device.h: make struct bus_type a const *"), the driver core can properly handle constant struct bus_type, move the tee_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B. 
Marliere Reviewed-by: Sumit Garg Reviewed-by: Greg Kroah-Hartman Signed-off-by: Jens Wiklander Signed-off-by: Arnd Bergmann --- include/linux/tee_drv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index 911ddf92dcee..71632e3c5f18 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -482,7 +482,7 @@ static inline bool tee_param_is_memref(struct tee_param *param) } } -extern struct bus_type tee_bus_type; +extern const struct bus_type tee_bus_type; /** * struct tee_client_device - tee based device -- cgit v1.2.3 From ad8ee969d7e34dd310a618d798cc6d8fe4f04464 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Sun, 3 Mar 2024 11:48:53 +0100 Subject: of: make for_each_property_of_node() available to to !OF for_each_property_of_node() is a macro and so doesn't have a stub inline function for !OF. Move it out of the relevant #ifdef to make it available to all users. Fixes: 611cad720148 ("dt: add of_alias_scan and of_alias_get_id") Signed-off-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20240303104853.31511-1-brgl@bgdev.pl Signed-off-by: Rob Herring --- include/linux/of.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 6a9ddf20e79a..a3e8e429ad7f 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -362,9 +362,6 @@ extern struct device_node *of_get_cpu_state_node(struct device_node *cpu_node, int index); extern u64 of_get_cpu_hwid(struct device_node *cpun, unsigned int thread); -#define for_each_property_of_node(dn, pp) \ - for (pp = dn->properties; pp != NULL; pp = pp->next) - extern int of_n_addr_cells(struct device_node *np); extern int of_n_size_cells(struct device_node *np); extern const struct of_device_id *of_match_node( @@ -892,6 +889,9 @@ static inline int of_prop_val_eq(struct property *p1, struct property *p2) !memcmp(p1->value, 
p2->value, (size_t)p1->length); } +#define for_each_property_of_node(dn, pp) \ + for (pp = dn->properties; pp != NULL; pp = pp->next) + #if defined(CONFIG_OF) && defined(CONFIG_NUMA) extern int of_node_to_nid(struct device_node *np); #else -- cgit v1.2.3 From 187e2af05abe6bf80581490239c449456627d17a Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Sat, 24 Feb 2024 14:34:17 -0800 Subject: bpf: struct_ops supports more than one page for trampolines. The BPF struct_ops previously only allowed one page of trampolines. Each function pointer of a struct_ops is implemented by a struct_ops bpf program. Each struct_ops bpf program requires a trampoline. The following selftest patch shows each page can hold a little more than 20 trampolines. While one page is more than enough for the tcp-cc usecase, the sched_ext use case shows that one page is not always enough and hits the one page limit. This patch overcomes the one page limit by allocating another page when needed and it is limited to a total of MAX_IMAGE_PAGES (8) pages which is more than enough for reasonable usages. The variable st_map->image has been changed to st_map->image_pages, and its type has been changed to an array of pointers to pages. 
Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20240224223418.526631-3-thinker.li@gmail.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 814dc913a968..785660810e6a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1763,7 +1763,9 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, struct bpf_tramp_link *link, const struct btf_func_model *model, void *stub_func, - void *image, void *image_end); + void **image, u32 *image_off, + bool allow_alloc); +void bpf_struct_ops_image_free(void *image); static inline bool bpf_try_module_get(const void *data, struct module *owner) { if (owner == BPF_MODULE_OWNER) -- cgit v1.2.3 From 803de9000f334b771afacb6ff3e78622916668b0 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 21 Feb 2024 12:43:58 +0100 Subject: mm, vmscan: prevent infinite loop for costly GFP_NOIO | __GFP_RETRY_MAYFAIL allocations Sven reports an infinite loop in __alloc_pages_slowpath() for costly order __GFP_RETRY_MAYFAIL allocations that are also GFP_NOIO. Such combination can happen in a suspend/resume context where a GFP_KERNEL allocation can have __GFP_IO masked out via gfp_allowed_mask. Quoting Sven: 1. try to do a "costly" allocation (order > PAGE_ALLOC_COSTLY_ORDER) with __GFP_RETRY_MAYFAIL set. 2. page alloc's __alloc_pages_slowpath tries to get a page from the freelist. This fails because there is nothing free of that costly order. 3. page alloc tries to reclaim by calling __alloc_pages_direct_reclaim, which bails out because a zone is ready to be compacted; it pretends to have made a single page of progress. 4. page alloc tries to compact, but this always bails out early because __GFP_IO is not set (it's not passed by the snd allocator, and even if it were, we are suspending so the __GFP_IO flag would be cleared anyway). 5. 
page alloc believes reclaim progress was made (because of the pretense in item 3) and so it checks whether it should retry compaction. The compaction retry logic thinks it should try again, because: a) reclaim is needed because of the early bail-out in item 4 b) a zonelist is suitable for compaction 6. goto 2. indefinite stall. (end quote) The immediate root cause is confusing the COMPACT_SKIPPED returned from __alloc_pages_direct_compact() (step 4) due to lack of __GFP_IO to be indicating a lack of order-0 pages, and in step 5 evaluating that in should_compact_retry() as a reason to retry, before incrementing and limiting the number of retries. There are however other places that wrongly assume that compaction can happen while we lack __GFP_IO. To fix this, introduce gfp_compaction_allowed() to abstract the __GFP_IO evaluation and switch the open-coded test in try_to_compact_pages() to use it. Also use the new helper in: - compaction_ready(), which will make reclaim not bail out in step 3, so there's at least one attempt to actually reclaim, even if chances are small for a costly order - in_reclaim_compaction() which will make should_continue_reclaim() return false and we don't over-reclaim unnecessarily - in __alloc_pages_slowpath() to set a local variable can_compact, which is then used to avoid retrying reclaim/compaction for costly allocations (step 5) if we can't compact and also to skip the early compaction attempt that we do in some cases Link: https://lkml.kernel.org/r/20240221114357.13655-2-vbabka@suse.cz Fixes: 3250845d0526 ("Revert "mm, oom: prevent premature OOM killer invocation for high order request"") Signed-off-by: Vlastimil Babka Reported-by: Sven van Ashbrook Closes: https://lore.kernel.org/all/CAG-rBihs_xMKb3wrMO1%2B-%2Bp4fowP9oy1pa_OTkfxBzPUVOZF%2Bg@mail.gmail.com/ Tested-by: Karthikeyan Ramasubramanian Cc: Brian Geffon Cc: Curtis Malainey Cc: Jaroslav Kysela Cc: Mel Gorman Cc: Michal Hocko Cc: Takashi Iwai Cc: Signed-off-by: Andrew Morton --- 
include/linux/gfp.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index de292a007138..e2a916cf29c4 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -353,6 +353,15 @@ static inline bool gfp_has_io_fs(gfp_t gfp) return (gfp & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS); } +/* + * Check if the gfp flags allow compaction - GFP_NOIO is a really + * tricky context because the migration might require IO. + */ +static inline bool gfp_compaction_allowed(gfp_t gfp_mask) +{ + return IS_ENABLED(CONFIG_COMPACTION) && (gfp_mask & __GFP_IO); +} + extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma); #ifdef CONFIG_CONTIG_ALLOC -- cgit v1.2.3 From 772dd0342727cc3c3b676b5d97c99708e75730a2 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 23 Feb 2024 17:58:00 -0800 Subject: mm: enumerate all gfp flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce GFP bits enumeration to let compiler track the number of used bits (which depends on the config options) instead of hardcoding them. That simplifies __GFP_BITS_SHIFT calculation. 
Link: https://lkml.kernel.org/r/20240224015800.2569851-1-surenb@google.com Suggested-by: Petr Tesařík Signed-off-by: Suren Baghdasaryan Reviewed-by: Kees Cook Reviewed-by: Pasha Tatashin Acked-by: Michal Hocko Cc: Kent Overstreet Cc: Petr Tesarik Signed-off-by: Andrew Morton --- include/linux/gfp_types.h | 90 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 62 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 1b6053da8754..868c8fb1bbc1 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -21,44 +21,78 @@ typedef unsigned int __bitwise gfp_t; * include/trace/events/mmflags.h and tools/perf/builtin-kmem.c */ +enum { + ___GFP_DMA_BIT, + ___GFP_HIGHMEM_BIT, + ___GFP_DMA32_BIT, + ___GFP_MOVABLE_BIT, + ___GFP_RECLAIMABLE_BIT, + ___GFP_HIGH_BIT, + ___GFP_IO_BIT, + ___GFP_FS_BIT, + ___GFP_ZERO_BIT, + ___GFP_UNUSED_BIT, /* 0x200u unused */ + ___GFP_DIRECT_RECLAIM_BIT, + ___GFP_KSWAPD_RECLAIM_BIT, + ___GFP_WRITE_BIT, + ___GFP_NOWARN_BIT, + ___GFP_RETRY_MAYFAIL_BIT, + ___GFP_NOFAIL_BIT, + ___GFP_NORETRY_BIT, + ___GFP_MEMALLOC_BIT, + ___GFP_COMP_BIT, + ___GFP_NOMEMALLOC_BIT, + ___GFP_HARDWALL_BIT, + ___GFP_THISNODE_BIT, + ___GFP_ACCOUNT_BIT, + ___GFP_ZEROTAGS_BIT, +#ifdef CONFIG_KASAN_HW_TAGS + ___GFP_SKIP_ZERO_BIT, + ___GFP_SKIP_KASAN_BIT, +#endif +#ifdef CONFIG_LOCKDEP + ___GFP_NOLOCKDEP_BIT, +#endif + ___GFP_LAST_BIT +}; + /* Plain integer GFP bitmasks. Do not use this directly. 
*/ -#define ___GFP_DMA 0x01u -#define ___GFP_HIGHMEM 0x02u -#define ___GFP_DMA32 0x04u -#define ___GFP_MOVABLE 0x08u -#define ___GFP_RECLAIMABLE 0x10u -#define ___GFP_HIGH 0x20u -#define ___GFP_IO 0x40u -#define ___GFP_FS 0x80u -#define ___GFP_ZERO 0x100u +#define ___GFP_DMA BIT(___GFP_DMA_BIT) +#define ___GFP_HIGHMEM BIT(___GFP_HIGHMEM_BIT) +#define ___GFP_DMA32 BIT(___GFP_DMA32_BIT) +#define ___GFP_MOVABLE BIT(___GFP_MOVABLE_BIT) +#define ___GFP_RECLAIMABLE BIT(___GFP_RECLAIMABLE_BIT) +#define ___GFP_HIGH BIT(___GFP_HIGH_BIT) +#define ___GFP_IO BIT(___GFP_IO_BIT) +#define ___GFP_FS BIT(___GFP_FS_BIT) +#define ___GFP_ZERO BIT(___GFP_ZERO_BIT) /* 0x200u unused */ -#define ___GFP_DIRECT_RECLAIM 0x400u -#define ___GFP_KSWAPD_RECLAIM 0x800u -#define ___GFP_WRITE 0x1000u -#define ___GFP_NOWARN 0x2000u -#define ___GFP_RETRY_MAYFAIL 0x4000u -#define ___GFP_NOFAIL 0x8000u -#define ___GFP_NORETRY 0x10000u -#define ___GFP_MEMALLOC 0x20000u -#define ___GFP_COMP 0x40000u -#define ___GFP_NOMEMALLOC 0x80000u -#define ___GFP_HARDWALL 0x100000u -#define ___GFP_THISNODE 0x200000u -#define ___GFP_ACCOUNT 0x400000u -#define ___GFP_ZEROTAGS 0x800000u +#define ___GFP_DIRECT_RECLAIM BIT(___GFP_DIRECT_RECLAIM_BIT) +#define ___GFP_KSWAPD_RECLAIM BIT(___GFP_KSWAPD_RECLAIM_BIT) +#define ___GFP_WRITE BIT(___GFP_WRITE_BIT) +#define ___GFP_NOWARN BIT(___GFP_NOWARN_BIT) +#define ___GFP_RETRY_MAYFAIL BIT(___GFP_RETRY_MAYFAIL_BIT) +#define ___GFP_NOFAIL BIT(___GFP_NOFAIL_BIT) +#define ___GFP_NORETRY BIT(___GFP_NORETRY_BIT) +#define ___GFP_MEMALLOC BIT(___GFP_MEMALLOC_BIT) +#define ___GFP_COMP BIT(___GFP_COMP_BIT) +#define ___GFP_NOMEMALLOC BIT(___GFP_NOMEMALLOC_BIT) +#define ___GFP_HARDWALL BIT(___GFP_HARDWALL_BIT) +#define ___GFP_THISNODE BIT(___GFP_THISNODE_BIT) +#define ___GFP_ACCOUNT BIT(___GFP_ACCOUNT_BIT) +#define ___GFP_ZEROTAGS BIT(___GFP_ZEROTAGS_BIT) #ifdef CONFIG_KASAN_HW_TAGS -#define ___GFP_SKIP_ZERO 0x1000000u -#define ___GFP_SKIP_KASAN 0x2000000u +#define ___GFP_SKIP_ZERO 
BIT(___GFP_SKIP_ZERO_BIT) +#define ___GFP_SKIP_KASAN BIT(___GFP_SKIP_KASAN_BIT) #else #define ___GFP_SKIP_ZERO 0 #define ___GFP_SKIP_KASAN 0 #endif #ifdef CONFIG_LOCKDEP -#define ___GFP_NOLOCKDEP 0x4000000u +#define ___GFP_NOLOCKDEP BIT(___GFP_NOLOCKDEP_BIT) #else #define ___GFP_NOLOCKDEP 0 #endif -/* If the above are modified, __GFP_BITS_SHIFT may need updating */ /* * Physical address zone modifiers (see linux/mmzone.h - low four bits) @@ -249,7 +283,7 @@ typedef unsigned int __bitwise gfp_t; #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT (26 + IS_ENABLED(CONFIG_LOCKDEP)) +#define __GFP_BITS_SHIFT ___GFP_LAST_BIT #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /** -- cgit v1.2.3 From 502003bb76b83649cd4ff7f701987ac5cf43bc4b Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 26 Feb 2024 15:55:29 -0500 Subject: mm/memcg: use order instead of nr in split_page_memcg() We do not have non power of two pages, using nr is error prone if nr is not power-of-two. Use page order instead. Link: https://lkml.kernel.org/r/20240226205534.1603748-4-zi.yan@sent.com Signed-off-by: Zi Yan Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Kirill A. 
Shutemov Cc: Luis Chamberlain Cc: "Matthew Wilcox (Oracle)" Cc: Michal Koutny Cc: Roman Gushchin Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 4e4caeaea404..173bbb53c1ec 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1163,7 +1163,7 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, rcu_read_unlock(); } -void split_page_memcg(struct page *head, unsigned int nr); +void split_page_memcg(struct page *head, int order); unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, @@ -1621,7 +1621,7 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } -static inline void split_page_memcg(struct page *head, unsigned int nr) +static inline void split_page_memcg(struct page *head, int order) { } -- cgit v1.2.3 From 9a581c12cddb06696fe4811239934fcde57ceb91 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 26 Feb 2024 15:55:30 -0500 Subject: mm/page_owner: use order instead of nr in split_page_owner() We do not have non power of two pages, using nr is error prone if nr is not power-of-two. Use page order instead. Link: https://lkml.kernel.org/r/20240226205534.1603748-5-zi.yan@sent.com Signed-off-by: Zi Yan Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Kirill A. 
Shutemov Cc: Luis Chamberlain Cc: "Matthew Wilcox (Oracle)" Cc: Michal Koutny Cc: Roman Gushchin Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/page_owner.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h index 119a0c9d2a8b..2b39c8e19d98 100644 --- a/include/linux/page_owner.h +++ b/include/linux/page_owner.h @@ -11,7 +11,7 @@ extern struct page_ext_operations page_owner_ops; extern void __reset_page_owner(struct page *page, unsigned short order); extern void __set_page_owner(struct page *page, unsigned short order, gfp_t gfp_mask); -extern void __split_page_owner(struct page *page, unsigned int nr); +extern void __split_page_owner(struct page *page, int order); extern void __folio_copy_owner(struct folio *newfolio, struct folio *old); extern void __set_page_owner_migrate_reason(struct page *page, int reason); extern void __dump_page_owner(const struct page *page); @@ -31,10 +31,10 @@ static inline void set_page_owner(struct page *page, __set_page_owner(page, order, gfp_mask); } -static inline void split_page_owner(struct page *page, unsigned int nr) +static inline void split_page_owner(struct page *page, int order) { if (static_branch_unlikely(&page_owner_inited)) - __split_page_owner(page, nr); + __split_page_owner(page, order); } static inline void folio_copy_owner(struct folio *newfolio, struct folio *old) { @@ -59,8 +59,7 @@ static inline void set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) { } -static inline void split_page_owner(struct page *page, - unsigned short order) +static inline void split_page_owner(struct page *page, int order) { } static inline void folio_copy_owner(struct folio *newfolio, struct folio *folio) -- cgit v1.2.3 From b8791381d7edae3706edde207f52d9e483ed400c Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 26 Feb 2024 15:55:31 -0500 Subject: mm: 
memcg: make memcg huge page split support any order split It sets memcg information for the pages after the split. A new parameter new_order is added to tell the order of subpages in the new page, always 0 for now. It prepares for upcoming changes to support split huge page to any lower order. Link: https://lkml.kernel.org/r/20240226205534.1603748-6-zi.yan@sent.com Signed-off-by: Zi Yan Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: "Matthew Wilcox (Oracle)" Cc: Michal Koutny Cc: Roman Gushchin Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 173bbb53c1ec..9a2dea92be0e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1163,7 +1163,7 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, rcu_read_unlock(); } -void split_page_memcg(struct page *head, int order); +void split_page_memcg(struct page *head, int old_order, int new_order); unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, @@ -1621,7 +1621,7 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } -static inline void split_page_memcg(struct page *head, int order) +static inline void split_page_memcg(struct page *head, int old_order, int new_order) { } -- cgit v1.2.3 From 46d44d09d24c5b451d6ab8f0fca5a40f651e3837 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 26 Feb 2024 15:55:32 -0500 Subject: mm: page_owner: add support for splitting to any order in split page_owner It adds a new_order parameter to set new page order in page owner. It prepares for upcoming changes to support split huge page to any lower order. 
Link: https://lkml.kernel.org/r/20240226205534.1603748-7-zi.yan@sent.com Signed-off-by: Zi Yan Cc: David Hildenbrand Acked-by: David Hildenbrand Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: "Matthew Wilcox (Oracle)" Cc: Michal Koutny Cc: Roman Gushchin Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/page_owner.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h index 2b39c8e19d98..debdc25f08b9 100644 --- a/include/linux/page_owner.h +++ b/include/linux/page_owner.h @@ -11,7 +11,8 @@ extern struct page_ext_operations page_owner_ops; extern void __reset_page_owner(struct page *page, unsigned short order); extern void __set_page_owner(struct page *page, unsigned short order, gfp_t gfp_mask); -extern void __split_page_owner(struct page *page, int order); +extern void __split_page_owner(struct page *page, int old_order, + int new_order); extern void __folio_copy_owner(struct folio *newfolio, struct folio *old); extern void __set_page_owner_migrate_reason(struct page *page, int reason); extern void __dump_page_owner(const struct page *page); @@ -31,10 +32,11 @@ static inline void set_page_owner(struct page *page, __set_page_owner(page, order, gfp_mask); } -static inline void split_page_owner(struct page *page, int order) +static inline void split_page_owner(struct page *page, int old_order, + int new_order) { if (static_branch_unlikely(&page_owner_inited)) - __split_page_owner(page, order); + __split_page_owner(page, old_order, new_order); } static inline void folio_copy_owner(struct folio *newfolio, struct folio *old) { @@ -56,10 +58,11 @@ static inline void reset_page_owner(struct page *page, unsigned short order) { } static inline void set_page_owner(struct page *page, - unsigned int order, gfp_t gfp_mask) + unsigned short order, gfp_t gfp_mask) { } -static inline void split_page_owner(struct 
page *page, int order) +static inline void split_page_owner(struct page *page, int old_order, + int new_order) { } static inline void folio_copy_owner(struct folio *newfolio, struct folio *folio) -- cgit v1.2.3 From c010d47f107f609b9f4d6a103b6dfc53889049e9 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 26 Feb 2024 15:55:33 -0500 Subject: mm: thp: split huge page to any lower order pages To split a THP to any lower order pages, we need to reform THPs on subpages at given order and add page refcount based on the new page order. Also we need to reinitialize page_deferred_list after removing the page from the split_queue, otherwise a subsequent split will see list corruption when checking the page_deferred_list again. Note: Anonymous order-1 folio is not supported because _deferred_list, which is used by partially mapped folios, is stored in subpage 2 and an order-1 folio only has subpage 0 and 1. File-backed order-1 folios are fine, since they do not use _deferred_list. [ziy@nvidia.com: fixup per discussion with Ryan] Link: https://lkml.kernel.org/r/494F48CD-1F0F-4CAD-884E-6D48F40AF990@nvidia.com Link: https://lkml.kernel.org/r/20240226205534.1603748-8-zi.yan@sent.com Signed-off-by: Zi Yan Cc: David Hildenbrand Cc: Hugh Dickins Cc: Kirill A. 
Shutemov Cc: Luis Chamberlain Cc: "Matthew Wilcox (Oracle)" Cc: Michal Koutny Cc: Roman Gushchin Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 5adb86af35fc..de0c89105076 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -265,10 +265,11 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, void folio_prep_large_rmappable(struct folio *folio); bool can_split_folio(struct folio *folio, int *pextra_pins); -int split_huge_page_to_list(struct page *page, struct list_head *list); +int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, + unsigned int new_order); static inline int split_huge_page(struct page *page) { - return split_huge_page_to_list(page, NULL); + return split_huge_page_to_list_to_order(page, NULL, 0); } void deferred_split_folio(struct folio *folio); @@ -422,7 +423,8 @@ can_split_folio(struct folio *folio, int *pextra_pins) return false; } static inline int -split_huge_page_to_list(struct page *page, struct list_head *list) +split_huge_page_to_list_to_order(struct page *page, struct list_head *list, + unsigned int new_order) { return 0; } @@ -519,17 +521,20 @@ static inline bool thp_migration_supported(void) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline int split_folio_to_list(struct folio *folio, - struct list_head *list) +static inline int split_folio_to_list_to_order(struct folio *folio, + struct list_head *list, int new_order) { - return split_huge_page_to_list(&folio->page, list); + return split_huge_page_to_list_to_order(&folio->page, list, new_order); } -static inline int split_folio(struct folio *folio) +static inline int split_folio_to_order(struct folio *folio, int new_order) { - return split_folio_to_list(folio, NULL); + 
return split_folio_to_list_to_order(folio, NULL, new_order); } +#define split_folio_to_list(f, l) split_folio_to_list_to_order(f, l, 0) +#define split_folio(f) split_folio_to_order(f, 0) + /* * archs that select ARCH_WANTS_THP_SWAP but don't support THP_SWP due to * limitations in the implementation like arm64 MTE can override this to -- cgit v1.2.3 From 5ce1f4844ba0def4b1b5526d8ccea27a98e840e5 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 26 Feb 2024 15:13:24 +0100 Subject: mm: remove total_mapcount() All users of total_mapcount() are gone, let's remove it. Link: https://lkml.kernel.org/r/20240226141324.278526-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6f4825d82965..49e22a2f6ccc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1183,7 +1183,7 @@ static inline int is_vmalloc_or_module_addr(const void *x) * How many times the entire folio is mapped as a single unit (eg by a * PMD or PUD entry). This is probably not what you want, except for * debugging purposes - it does not include PTE-mapped sub-pages; look - * at folio_mapcount() or page_mapcount() or total_mapcount() instead. + * at folio_mapcount() or page_mapcount() instead. 
*/ static inline int folio_entire_mapcount(struct folio *folio) { @@ -1243,13 +1243,6 @@ static inline int folio_mapcount(struct folio *folio) return folio_total_mapcount(folio); } -static inline int total_mapcount(struct page *page) -{ - if (likely(!PageCompound(page))) - return atomic_read(&page->_mapcount) + 1; - return folio_total_mapcount(page_folio(page)); -} - static inline bool folio_large_is_mapped(struct folio *folio) { /* -- cgit v1.2.3 From 99fbb6bfc16f202adc411ad5d353db214750d121 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:35 +0000 Subject: mm: make folios_put() the basis of release_pages() Patch series "Rearrange batched folio freeing", v3. Other than the obvious "remove calls to compound_head" changes, the fundamental belief here is that iterating a linked list is much slower than iterating an array (5-15x slower in my testing). There's also an associated belief that since we iterate the batch of folios three times, we do better when the array is small (ie 15 entries) than we do with a batch that is hundreds of entries long, which only gives us the opportunity for the first pages to fall out of cache by the time we get to the end. It is possible we should increase the size of folio_batch. Hopefully the bots let us know if this introduces any performance regressions. This patch (of 3): By making release_pages() call folios_put(), we can get rid of the calls to compound_head() for the callers that already know they have folios. We can also get rid of the lock_batch tracking as we know the size of the batch is limited by folio_batch. This does reduce the maximum number of pages for which the lruvec lock is held, from SWAP_CLUSTER_MAX (32) to PAGEVEC_SIZE (15). I do not expect this to make a significant difference, but if it does, we can increase PAGEVEC_SIZE to 31. 
Link: https://lkml.kernel.org/r/20240227174254.710559-1-willy@infradead.org Link: https://lkml.kernel.org/r/20240227174254.710559-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 49e22a2f6ccc..5c57fde9b69b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -36,6 +36,7 @@ struct anon_vma; struct anon_vma_chain; struct user_struct; struct pt_regs; +struct folio_batch; extern int sysctl_page_lock_unfairness; @@ -1512,6 +1513,8 @@ static inline void folio_put_refs(struct folio *folio, int refs) __folio_put(folio); } +void folios_put_refs(struct folio_batch *folios, unsigned int *refs); + /* * union release_pages_arg - an array of pages or folios * @@ -1534,18 +1537,19 @@ void release_pages(release_pages_arg, int nr); /** * folios_put - Decrement the reference count on an array of folios. * @folios: The folios. - * @nr: How many folios there are. * - * Like folio_put(), but for an array of folios. This is more efficient - * than writing the loop yourself as it will optimise the locks which - * need to be taken if the folios are freed. + * Like folio_put(), but for a batch of folios. This is more efficient + * than writing the loop yourself as it will optimise the locks which need + * to be taken if the folios are freed. The folios batch is returned + * empty and ready to be reused for another batch; there is no need to + * reinitialise it. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. 
*/ -static inline void folios_put(struct folio **folios, unsigned int nr) +static inline void folios_put(struct folio_batch *folios) { - release_pages(folios, nr); + folios_put_refs(folios, NULL); } static inline void put_page(struct page *page) -- cgit v1.2.3 From 4882c80975e2bf7241a5b043eb1dbe8df2726a29 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:39 +0000 Subject: memcg: add mem_cgroup_uncharge_folios() Almost identical to mem_cgroup_uncharge_list(), except it takes a folio_batch instead of a list_head. Link: https://lkml.kernel.org/r/20240227174254.710559-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9a2dea92be0e..35a0d7a851f0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -721,10 +721,16 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) __mem_cgroup_uncharge_list(page_list); } -void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages); +void __mem_cgroup_uncharge_folios(struct folio_batch *folios); +static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) +{ + if (mem_cgroup_disabled()) + return; + __mem_cgroup_uncharge_folios(folios); +} +void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages); void mem_cgroup_replace_folio(struct folio *old, struct folio *new); - void mem_cgroup_migrate(struct folio *old, struct folio *new); /** @@ -1299,6 +1305,10 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) { } +static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) +{ +} + static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 
{ -- cgit v1.2.3 From f1ee018baee9f4e724e08859c2559323be768be3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:42 +0000 Subject: mm: use __page_cache_release() in folios_put() Pass a pointer to the lruvec so we can take advantage of the folio_lruvec_relock_irqsave(). Adjust the calling convention of folio_lruvec_relock_irqsave() to suit and add a page_cache_release() wrapper. Link: https://lkml.kernel.org/r/20240227174254.710559-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 35a0d7a851f0..b7f5e0c17de7 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1705,18 +1705,18 @@ static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio, return folio_lruvec_lock_irq(folio); } -/* Don't lock again iff page's lruvec locked */ -static inline struct lruvec *folio_lruvec_relock_irqsave(struct folio *folio, - struct lruvec *locked_lruvec, unsigned long *flags) +/* Don't lock again iff folio's lruvec locked */ +static inline void folio_lruvec_relock_irqsave(struct folio *folio, + struct lruvec **lruvecp, unsigned long *flags) { - if (locked_lruvec) { - if (folio_matches_lruvec(folio, locked_lruvec)) - return locked_lruvec; + if (*lruvecp) { + if (folio_matches_lruvec(folio, *lruvecp)) + return; - unlock_page_lruvec_irqrestore(locked_lruvec, *flags); + unlock_page_lruvec_irqrestore(*lruvecp, *flags); } - return folio_lruvec_lock_irqsave(folio, flags); + *lruvecp = folio_lruvec_lock_irqsave(folio, flags); } #ifdef CONFIG_CGROUP_WRITEBACK -- cgit v1.2.3 From be5a9e17a2ccbecfb7020aa1938e2c62d8a9189c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:47 +0000 Subject: memcg: 
remove mem_cgroup_uncharge_list() All users have been converted to mem_cgroup_uncharge_folios() so we can remove this API. Link: https://lkml.kernel.org/r/20240227174254.710559-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b7f5e0c17de7..394fd0a887ae 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -713,14 +713,6 @@ static inline void mem_cgroup_uncharge(struct folio *folio) __mem_cgroup_uncharge(folio); } -void __mem_cgroup_uncharge_list(struct list_head *page_list); -static inline void mem_cgroup_uncharge_list(struct list_head *page_list) -{ - if (mem_cgroup_disabled()) - return; - __mem_cgroup_uncharge_list(page_list); -} - void __mem_cgroup_uncharge_folios(struct folio_batch *folios); static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) { @@ -1301,10 +1293,6 @@ static inline void mem_cgroup_uncharge(struct folio *folio) { } -static inline void mem_cgroup_uncharge_list(struct list_head *page_list) -{ -} - static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) { } -- cgit v1.2.3 From f39ec4dcb9e9b03b2a280829b8c15e3ae607398c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:49 +0000 Subject: mm: remove lru_to_page() The last user was removed over a year ago; remove the definition. 
Link: https://lkml.kernel.org/r/20240227174254.710559-16-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5c57fde9b69b..d45eadc440f5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -227,7 +227,6 @@ int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) -#define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) static inline struct folio *lru_to_folio(struct list_head *head) { return list_entry((head)->prev, struct folio, lru); -- cgit v1.2.3 From 63b774993dd02b17127cb404b7362fc436632995 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:52 +0000 Subject: mm: convert free_swap_cache() to take a folio All but one caller already has a folio, so convert free_page_and_swap_cache() to have a folio and remove the call to page_folio(). 
Link: https://lkml.kernel.org/r/20240227174254.710559-19-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Reviewed-by: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton --- include/linux/swap.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 8d28f6091a32..1ad6f63d1a52 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -447,9 +447,9 @@ static inline unsigned long total_swapcache_pages(void) return global_node_page_state(NR_SWAPCACHE); } -extern void free_swap_cache(struct page *page); -extern void free_page_and_swap_cache(struct page *); -extern void free_pages_and_swap_cache(struct encoded_page **, int); +void free_swap_cache(struct folio *folio); +void free_page_and_swap_cache(struct page *); +void free_pages_and_swap_cache(struct encoded_page **, int); /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; @@ -531,7 +531,7 @@ static inline void put_swap_device(struct swap_info_struct *si) /* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */ #define free_swap_and_cache(e) is_pfn_swap_entry(e) -static inline void free_swap_cache(struct page *page) +static inline void free_swap_cache(struct folio *folio) { } -- cgit v1.2.3 From 8f8cd6c0a43ed637e620bbe45a8d0e0c2f4d5130 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 27 Feb 2024 10:35:46 +0800 Subject: modules: wait do_free_init correctly The synchronization here is to ensure the ordering of freeing of a module init so that it happens before W+X checking. It is worth noting it is not that the freeing was not happening, it is just that our sanity checkers raced against the permission checkers which assume init memory is already gone. 
Commit 1a7b7d922081 ("modules: Use vmalloc special flag") moved calling do_free_init() into a global workqueue instead of relying on it being called through call_rcu(..., do_free_init), which used to allow us to call do_free_init() asynchronously after the end of a subsequent grace period. The move to a global workqueue broke the guarantees for code which needed to be sure the do_free_init() would complete with rcu_barrier(). To fix this, callers which used to rely on rcu_barrier() must now instead use flush_work(&init_free_wq). Without this fix, we still could encounter false positive reports in W+X checking since the rcu_barrier() here can not ensure the ordering now. Even worse, the rcu_barrier() can introduce significant delay. Eric Chanudet reported that the rcu_barrier introduces ~0.1s delay on a PREEMPT_RT kernel. [ 0.291444] Freeing unused kernel memory: 5568K [ 0.402442] Run /sbin/init as init process With this fix, the above delay can be eliminated. Link: https://lkml.kernel.org/r/20240227023546.2490667-1-changbin.du@huawei.com Fixes: 1a7b7d922081 ("modules: Use vmalloc special flag") Signed-off-by: Changbin Du Tested-by: Eric Chanudet Acked-by: Luis Chamberlain Cc: Xiaoyi Su Signed-off-by: Andrew Morton --- include/linux/moduleloader.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 001b2ce83832..89b1e0ed9811 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -115,6 +115,14 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod); +#ifdef CONFIG_MODULES +void flush_module_init_free_work(void); +#else +static inline void flush_module_init_free_work(void) +{ +} +#endif + /* Any cleanup needed when module leaves. 
*/ void module_arch_cleanup(struct module *mod); -- cgit v1.2.3 From 345a6e2631c1267221b684e110bba03e4c26ece0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 Mar 2024 17:19:45 +0000 Subject: tcp: align tcp_sock_write_rx group Stephen Rothwell and kernel test robot reported that some arches (parisc, hexagon) and/or compilers would not like the blamed commit. Let's make sure tcp_sock_write_rx group does not start with a hole. While we are at it, correct tcp_sock_write_tx CACHELINE_ASSERT_GROUP_SIZE() since after the blamed commit, we went to 105 bytes. Fixes: 99123622050f ("tcp: remove some holes in struct tcp_sock") Reported-by: Stephen Rothwell Reported-by: kernel test robot Link: https://lore.kernel.org/netdev/20240301121108.5d39e4f9@canb.auug.org.au/ Closes: https://lore.kernel.org/oe-kbuild-all/202403011451.csPYOS3C-lkp@intel.com/ Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Tested-by: Simon Horman # build-tested Link: https://lore.kernel.org/r/20240301171945.2958176-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 988a30ef6bfe..55399ee2a57e 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -304,7 +304,7 @@ struct tcp_sock { __cacheline_group_end(tcp_sock_write_txrx); /* RX read-write hotpath cache lines */ - __cacheline_group_begin(tcp_sock_write_rx); + __cacheline_group_begin(tcp_sock_write_rx) __aligned(8); u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived * sum(delta(rcv_nxt)), or how many bytes -- cgit v1.2.3 From 411c5f36805c02c7c412f1ad6bfa4459a1148011 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 28 Feb 2024 17:30:08 +0800 Subject: mm/page_alloc: modify page_frag_alloc_align() to accept align as an argument napi_alloc_frag_align() and netdev_alloc_frag_align() accept align as an argument, and they are thin wrappers around the 
__napi_alloc_frag_align() and __netdev_alloc_frag_align() APIs doing the alignment checking and align mask conversion, in order to call page_frag_alloc_align() directly. The intention here is to keep the alignment checking and the alignmask conversion in in-line wrapper to avoid those kind of operations during execution time since it can usually be handled during compile time. We are going to use page_frag_alloc_align() in vhost_net.c, it need the same kind of alignment checking and alignmask conversion, so split up page_frag_alloc_align into an inline wrapper doing the above operation, and add __page_frag_alloc_align() which is passed with the align mask the original function expected as suggested by Alexander. Signed-off-by: Yunsheng Lin CC: Alexander Duyck Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- include/linux/gfp.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index de292a007138..28aea17fa59b 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -312,14 +312,21 @@ extern void free_pages(unsigned long addr, unsigned int order); struct page_frag_cache; extern void __page_frag_cache_drain(struct page *page, unsigned int count); -extern void *page_frag_alloc_align(struct page_frag_cache *nc, - unsigned int fragsz, gfp_t gfp_mask, - unsigned int align_mask); +void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz, + gfp_t gfp_mask, unsigned int align_mask); + +static inline void *page_frag_alloc_align(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask, + unsigned int align) +{ + WARN_ON_ONCE(!is_power_of_2(align)); + return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align); +} static inline void *page_frag_alloc(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask) { - return page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u); + return __page_frag_alloc_align(nc, fragsz, gfp_mask, 
~0u); } extern void page_frag_free(void *addr); -- cgit v1.2.3 From a0727489ac22d6fbd2e390d38a51193bba61da83 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 28 Feb 2024 17:30:10 +0800 Subject: net: introduce page_frag_cache_drain() When draining a page_frag_cache, most users are doing similar steps, so introduce an API to avoid code duplication. Signed-off-by: Yunsheng Lin Acked-by: Jason Wang Reviewed-by: Alexander Duyck Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- include/linux/gfp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 28aea17fa59b..6cef1c241180 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -311,6 +311,7 @@ extern void __free_pages(struct page *page, unsigned int order); extern void free_pages(unsigned long addr, unsigned int order); struct page_frag_cache; +void page_frag_cache_drain(struct page_frag_cache *nc); extern void __page_frag_cache_drain(struct page *page, unsigned int count); void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask, unsigned int align_mask); -- cgit v1.2.3 From 88803989ff6d73155969df94cc84f1ecd28c4d9c Mon Sep 17 00:00:00 2001 From: Yang Xiwen Date: Thu, 29 Feb 2024 09:36:19 +0800 Subject: mmc: core: Use a struct device* as in-param to mmc_of_parse_clk_phase() Parsing dt usually happens very early, sometimes even before the struct mmc_host has been allocated (e.g. dw_mci_probe() and dw_mci_parse_dt() in dw_mmc.c). Looking at the source of mmc_of_parse_clk_phase(), it's actually not needed to have an initialized mmc_host, let's therefore pass a struct device* to it instead. Also update the only current user, sdhci-of-aspeed. 
Reviewed-by: Paul Menzel Acked-by: Andrew Jeffery Signed-off-by: Yang Xiwen Link: https://lore.kernel.org/r/20240229-b4-mmc-hi3798mv200-v7-1-10c03f316285@outlook.com Signed-off-by: Ulf Hansson --- include/linux/mmc/host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 2f445c651742..5894bf912f7b 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -539,7 +539,7 @@ struct mmc_host *devm_mmc_alloc_host(struct device *dev, int extra); int mmc_add_host(struct mmc_host *); void mmc_remove_host(struct mmc_host *); void mmc_free_host(struct mmc_host *); -void mmc_of_parse_clk_phase(struct mmc_host *host, +void mmc_of_parse_clk_phase(struct device *dev, struct mmc_clk_phase_map *map); int mmc_of_parse(struct mmc_host *host); int mmc_of_parse_voltage(struct mmc_host *host, u32 *mask); -- cgit v1.2.3 From 885c36e59f46375c138de18ff1692f18eff67b7f Mon Sep 17 00:00:00 2001 From: Abhishek Chauhan Date: Fri, 1 Mar 2024 12:13:48 -0800 Subject: net: Re-use and set mono_delivery_time bit for userspace tstamp packets Bridge driver today has no support to forward the userspace timestamp packets and ends up resetting the timestamp. ETF qdisc checks the packet coming from userspace and encounters to be 0 thereby dropping time sensitive packets. These changes will allow userspace timestamps packets to be forwarded from the bridge to NIC drivers. Setting the same bit (mono_delivery_time) to avoid dropping of userspace tstamp packets in the forwarding path. Existing functionality of mono_delivery_time remains unaltered here, instead just extended with userspace tstamp support for bridge forwarding path. 
Signed-off-by: Abhishek Chauhan Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240301201348.2815102-1-quic_abchauha@quicinc.com Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d577e0bee18d..3013355b63f5 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -822,9 +822,9 @@ typedef unsigned char *sk_buff_data_t; * @decrypted: Decrypted SKB * @slow_gro: state present at GRO time, slower prepare step required * @mono_delivery_time: When set, skb->tstamp has the - * delivery_time in mono clock base (i.e. EDT). Otherwise, the - * skb->tstamp has the (rcv) timestamp at ingress and - * delivery_time at egress. + * delivery_time in mono clock base (i.e., EDT) or a clock base chosen + * by SO_TXTIME. If zero, skb->tstamp has the (rcv) timestamp at + * ingress. * @napi_id: id of the NAPI struct this skb came from * @sender_cpu: (aka @napi_id) source CPU in XPS * @alloc_cpu: CPU which did the skb allocation. -- cgit v1.2.3 From d28240785e00ffb889d90368882bf382e22e9555 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Thu, 29 Feb 2024 14:17:33 +0100 Subject: usb: typec: pd: no opencoding of FIELD_GET If we have a neat macro, at least new code should use it. 
Signed-off-by: Oliver Neukum Link: https://lore.kernel.org/r/20240229131851.16148-2-oneukum@suse.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/pd_vdo.h | 5 +++-- include/linux/usb/typec_dp.h | 11 ++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/pd_vdo.h b/include/linux/usb/pd_vdo.h index c09c5a12e273..5c48e8a81403 100644 --- a/include/linux/usb/pd_vdo.h +++ b/include/linux/usb/pd_vdo.h @@ -7,6 +7,7 @@ #define __LINUX_USB_PD_VDO_H #include "pd.h" +#include /* * VDO : Vendor Defined Message Object @@ -188,7 +189,7 @@ * <5:3> :: Alternate modes * <2:0> :: USB highest speed */ -#define PD_VDO_UFP_DEVCAP(vdo) (((vdo) & GENMASK(27, 24)) >> 24) +#define PD_VDO_UFP_DEVCAP(vdo) FIELD_GET(GENMASK(27, 24), vdo) /* UFP VDO Version */ #define UFP_VDO_VER1_2 2 @@ -247,7 +248,7 @@ * <21:5> :: Reserved * <4:0> :: Port number */ -#define PD_VDO_DFP_HOSTCAP(vdo) (((vdo) & GENMASK(26, 24)) >> 24) +#define PD_VDO_DFP_HOSTCAP(vdo) FIELD_GET(GENMASK(26, 24), vdo) #define DFP_VDO_VER1_1 1 #define HOST_USB2_CAPABLE BIT(0) diff --git a/include/linux/usb/typec_dp.h b/include/linux/usb/typec_dp.h index 1f358098522d..f2da264d9c14 100644 --- a/include/linux/usb/typec_dp.h +++ b/include/linux/usb/typec_dp.h @@ -3,6 +3,7 @@ #define __USB_TYPEC_DP_H #include +#include #define USB_TYPEC_DP_SID 0xff01 /* USB IF has not assigned a Standard ID (SID) for VirtualLink, @@ -67,21 +68,21 @@ enum { #define DP_CAP_UFP_D 1 #define DP_CAP_DFP_D 2 #define DP_CAP_DFP_D_AND_UFP_D 3 -#define DP_CAP_DP_SIGNALLING(_cap_) (((_cap_) & GENMASK(5, 2)) >> 2) +#define DP_CAP_DP_SIGNALLING(_cap_) FIELD_GET(GENMASK(5, 2), _cap_) #define DP_CAP_SIGNALLING_HBR3 1 #define DP_CAP_SIGNALLING_UHBR10 2 #define DP_CAP_SIGNALLING_UHBR20 3 #define DP_CAP_RECEPTACLE BIT(6) #define DP_CAP_USB BIT(7) -#define DP_CAP_DFP_D_PIN_ASSIGN(_cap_) (((_cap_) & GENMASK(15, 8)) >> 8) -#define DP_CAP_UFP_D_PIN_ASSIGN(_cap_) (((_cap_) & GENMASK(23, 16)) >> 16) 
+#define DP_CAP_DFP_D_PIN_ASSIGN(_cap_) FIELD_GET(GENMASK(15, 8), _cap_) +#define DP_CAP_UFP_D_PIN_ASSIGN(_cap_) FIELD_GET(GENMASK(23, 16), _cap_) /* Get pin assignment taking plug & receptacle into consideration */ #define DP_CAP_PIN_ASSIGN_UFP_D(_cap_) ((_cap_ & DP_CAP_RECEPTACLE) ? \ DP_CAP_UFP_D_PIN_ASSIGN(_cap_) : DP_CAP_DFP_D_PIN_ASSIGN(_cap_)) #define DP_CAP_PIN_ASSIGN_DFP_D(_cap_) ((_cap_ & DP_CAP_RECEPTACLE) ? \ DP_CAP_DFP_D_PIN_ASSIGN(_cap_) : DP_CAP_UFP_D_PIN_ASSIGN(_cap_)) #define DP_CAP_UHBR_13_5_SUPPORT BIT(26) -#define DP_CAP_CABLE_TYPE(_cap_) (((_cap_) & GENMASK(29, 28)) >> 28) +#define DP_CAP_CABLE_TYPE(_cap_) FIELD_GET(GENMASK(29, 28), _cap_) #define DP_CAP_CABLE_TYPE_PASSIVE 0 #define DP_CAP_CABLE_TYPE_RE_TIMER 1 #define DP_CAP_CABLE_TYPE_RE_DRIVER 2 @@ -116,7 +117,7 @@ enum { /* Helper for setting/getting the pin assignment value to the configuration */ #define DP_CONF_SET_PIN_ASSIGN(_a_) ((_a_) << 8) -#define DP_CONF_GET_PIN_ASSIGN(_conf_) (((_conf_) & GENMASK(15, 8)) >> 8) +#define DP_CONF_GET_PIN_ASSIGN(_conf_) FIELD_GET(GENMASK(15, 8), _conf_) #define DP_CONF_UHBR13_5_SUPPORT BIT(26) #define DP_CONF_CABLE_TYPE_MASK GENMASK(29, 28) #define DP_CONF_CABLE_TYPE_SHIFT 28 -- cgit v1.2.3 From 82e82130a78b75a9ce5225df24d5a0b1b3290eb0 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 22 Feb 2024 16:58:21 -0800 Subject: usb: core: Set connect_type of ports based on DT node When a USB hub is described in DT, such as any device that matches the onboard-hub driver, the connect_type is set to "unknown" or USB_PORT_CONNECT_TYPE_UNKNOWN. This makes any device plugged into that USB port report their 'removable' device attribute as "unknown". ChromeOS userspace would like to know if the USB device is actually removable or not so that security policies can be applied. 
Improve the connect_type attribute for ports, and in turn the removable attribute for USB devices, by looking for child devices with a reg property or an OF graph when the device is described in DT. If the graph exists, endpoints that are connected to a remote node must be something like a usb-{a,b,c}-connector compatible node, or an intermediate node like a redriver, and not a hardwired USB device on the board. Set the connect_type to USB_PORT_CONNECT_TYPE_HOT_PLUG in this case because the device is going to be plugged in. Set the connect_type to USB_PORT_CONNECT_TYPE_HARD_WIRED if there's a child node for the port like 'device@2' for port2. Set the connect_type to USB_PORT_NOT_USED if there isn't an endpoint or child node corresponding to the port number. To make sure things don't change, only set the port to not used if there are child nodes. This way an onboard hub connect_type doesn't change until ports are added or child nodes are added to describe hardwired devices. It's assumed that all ports or no ports will be described for a device. 
Cc: Matthias Kaehlcke Cc: linux-usb@vger.kernel.org Cc: devicetree@vger.kernel.org Cc: Pin-yen Lin Cc: maciek swiech Signed-off-by: Stephen Boyd Link: https://lore.kernel.org/r/20240223005823.3074029-3-swboyd@chromium.org Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/of.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/of.h b/include/linux/usb/of.h index 98487fd7ab11..de42f14bd280 100644 --- a/include/linux/usb/of.h +++ b/include/linux/usb/of.h @@ -6,6 +6,7 @@ #ifndef __LINUX_USB_OF_H #define __LINUX_USB_OF_H +#include #include #include #include @@ -17,6 +18,7 @@ enum usb_dr_mode of_usb_get_dr_mode_by_phy(struct device_node *np, int arg0); bool of_usb_host_tpl_support(struct device_node *np); int of_usb_update_otg_caps(struct device_node *np, struct usb_otg_caps *otg_caps); +enum usb_port_connect_type usb_of_get_connect_type(struct usb_device *hub, int port1); struct device_node *usb_of_get_device_node(struct usb_device *hub, int port1); bool usb_of_has_combined_node(struct usb_device *udev); struct device_node *usb_of_get_interface_node(struct usb_device *udev, @@ -37,6 +39,11 @@ static inline int of_usb_update_otg_caps(struct device_node *np, { return 0; } +static inline enum usb_port_connect_type +usb_of_get_connect_type(const struct usb_device *hub, int port1) +{ + return USB_PORT_CONNECT_TYPE_UNKNOWN; +} static inline struct device_node * usb_of_get_device_node(struct usb_device *hub, int port1) { -- cgit v1.2.3 From ac92ea6b656374abab230f9f38fd3f0ab6cd0d61 Mon Sep 17 00:00:00 2001 From: Marco Felsch Date: Thu, 22 Feb 2024 22:09:02 +0100 Subject: usb: typec: tcpm: add support to set tcpc connector orientation This adds the support to set the connector orientation value accordingly. This is part of the optional CONFIG_STANDARD_OUTPUT register 0x18, specified within the USB port controller specification rev. 2.0 [1]. 
[1] https://www.usb.org/sites/default/files/documents/usb-port_controller_specification_rev2.0_v1.0_0.pdf Signed-off-by: Marco Felsch Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240222210903.208901-4-m.felsch@pengutronix.de Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/tcpm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/tcpm.h b/include/linux/usb/tcpm.h index 6671427f7eeb..061da9546a81 100644 --- a/include/linux/usb/tcpm.h +++ b/include/linux/usb/tcpm.h @@ -144,6 +144,8 @@ struct tcpc_dev { enum typec_cc_status *cc2); int (*set_polarity)(struct tcpc_dev *dev, enum typec_cc_polarity polarity); + int (*set_orientation)(struct tcpc_dev *dev, + enum typec_orientation orientation); int (*set_vconn)(struct tcpc_dev *dev, bool on); int (*set_vbus)(struct tcpc_dev *dev, bool on, bool charge); int (*set_current_limit)(struct tcpc_dev *dev, u32 max_ma, u32 mv); -- cgit v1.2.3 From 7bfb915a597a301abb892f620fe5c283a9fdbd77 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Sun, 3 Mar 2024 16:08:07 +0100 Subject: serial: core: only stop transmit when HW fifo is empty If the circular buffer is empty, it just means we fit all characters to send into the HW fifo, but not that the hardware finished transmitting them. So if we immediately call stop_tx() after that, this may abort any pending characters in the HW fifo, and cause dropped characters on the console. Fix this by only stopping tx when the tx HW fifo is actually empty. 
Fixes: 8275b48b2780 ("tty: serial: introduce transmit helpers") Cc: stable@vger.kernel.org Signed-off-by: Jonas Gorski Link: https://lore.kernel.org/r/20240303150807.68117-1-jonas.gorski@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 55b1f3ba48ac..bb0f2d4ac62f 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -786,7 +786,8 @@ enum UART_TX_FLAGS { if (pending < WAKEUP_CHARS) { \ uart_write_wakeup(__port); \ \ - if (!((flags) & UART_TX_NOSTOP) && pending == 0) \ + if (!((flags) & UART_TX_NOSTOP) && pending == 0 && \ + __port->ops->tx_empty(__port)) \ __port->ops->stop_tx(__port); \ } \ \ -- cgit v1.2.3 From 35c822a34b2293aedf475238c395e75858d1e8c8 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 4 Mar 2024 14:27:02 +0200 Subject: serial: core: Move struct uart_port::quirks closer to possible values Currently it's not crystal clear what UPIO_* and UPQ_* definitions belong to. Reindent the code, so it will be easy to read and understand. No functional changes intended. 
Reviewed-by: Andi Shyti Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240304123035.758700-2-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index bb0f2d4ac62f..f9d7f0a625fd 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -467,8 +467,8 @@ struct uart_port { unsigned int fifosize; /* tx fifo size */ unsigned char x_char; /* xon/xoff char */ unsigned char regshift; /* reg offset shift */ + unsigned char iotype; /* io access style */ - unsigned char quirks; /* internal quirks */ #define UPIO_PORT (SERIAL_IO_PORT) /* 8b I/O port access */ #define UPIO_HUB6 (SERIAL_IO_HUB6) /* Hub6 ISA card */ @@ -479,7 +479,9 @@ struct uart_port { #define UPIO_MEM32BE (SERIAL_IO_MEM32BE) /* 32b big endian */ #define UPIO_MEM16 (SERIAL_IO_MEM16) /* 16b little endian */ - /* quirks must be updated while holding port mutex */ + unsigned char quirks; /* internal quirks */ + + /* internal quirks must be updated while holding port mutex */ #define UPQ_NO_TXEN_TEST BIT(0) unsigned int read_status_mask; /* driver specific */ -- cgit v1.2.3 From 79d713baf63c8f23cc58b304c40be33d64a12aaf Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 4 Mar 2024 14:27:03 +0200 Subject: serial: core: Add UPIO_UNKNOWN constant for unknown port type In some APIs we would like to assign the special value to iotype and compare against it in another places. Introduce UPIO_UNKNOWN for this purpose. Note, we can't use 0, because it's a valid value for IO port access. 
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240304123035.758700-3-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index f9d7f0a625fd..3b64c9a26945 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -470,6 +470,7 @@ struct uart_port { unsigned char iotype; /* io access style */ +#define UPIO_UNKNOWN ((unsigned char)~0U) /* UCHAR_MAX */ #define UPIO_PORT (SERIAL_IO_PORT) /* 8b I/O port access */ #define UPIO_HUB6 (SERIAL_IO_HUB6) /* Hub6 ISA card */ #define UPIO_MEM (SERIAL_IO_MEM) /* driver-specific */ -- cgit v1.2.3 From e894b6005dce0ed621b2788d6a249708fb6f95f9 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 4 Mar 2024 14:27:04 +0200 Subject: serial: port: Introduce a common helper to read properties Several serial drivers want to read the same or similar set of the port properties. Make a common helper for them. 
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240304123035.758700-4-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 3b64c9a26945..0a0f6e21d40e 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -963,6 +963,8 @@ int uart_register_driver(struct uart_driver *uart); void uart_unregister_driver(struct uart_driver *uart); int uart_add_one_port(struct uart_driver *reg, struct uart_port *port); void uart_remove_one_port(struct uart_driver *reg, struct uart_port *port); +int uart_read_port_properties(struct uart_port *port); +int uart_read_and_validate_port_properties(struct uart_port *port); bool uart_match_port(const struct uart_port *port1, const struct uart_port *port2); -- cgit v1.2.3 From 7ba59ac7da2aae2fbcfe90352ee40d30cecca10d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 Mar 2024 13:19:45 -0800 Subject: greybus: Avoid fake flexible array for response data FORTIFY_SOURCE has been ignoring 0-sized destinations while the kernel code base has been converted to flexible arrays. In order to enforce the 0-sized destinations (e.g. with __counted_by), the remaining 0-sized destinations need to be handled. Instead of converting an empty struct into using a flexible array, just directly use a pointer without any additional indirection. Remove struct gb_bootrom_get_firmware_response and struct gb_fw_download_fetch_firmware_response. 
Signed-off-by: Kees Cook Reviewed-by: Alex Elder Link: https://lore.kernel.org/r/20240304211940.it.083-kees@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/greybus/greybus_protocols.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/greybus/greybus_protocols.h b/include/linux/greybus/greybus_protocols.h index aeb8f9243545..820134b0105c 100644 --- a/include/linux/greybus/greybus_protocols.h +++ b/include/linux/greybus/greybus_protocols.h @@ -232,9 +232,7 @@ struct gb_fw_download_fetch_firmware_request { __le32 size; } __packed; -struct gb_fw_download_fetch_firmware_response { - __u8 data[0]; -} __packed; +/* gb_fw_download_fetch_firmware_response contains no other data */ /* firmware download release firmware request */ struct gb_fw_download_release_firmware_request { @@ -414,9 +412,7 @@ struct gb_bootrom_get_firmware_request { __le32 size; } __packed; -struct gb_bootrom_get_firmware_response { - __u8 data[0]; -} __packed; +/* gb_bootrom_get_firmware_response contains no other data */ /* Bootrom protocol Ready to boot request */ struct gb_bootrom_ready_to_boot_request { -- cgit v1.2.3 From 68ade0976df7979eac5f1d46320ff798f5043af6 Mon Sep 17 00:00:00 2001 From: Sebastian Reichel Date: Fri, 1 Mar 2024 23:58:26 +0100 Subject: power: supply: core: add power_supply_for_each_device() Introduce power_supply_for_each_device(), which is a wrapper for class_for_each_device() using the power_supply_class and going through all devices. This allows making the power_supply_class itself a local variable, so that drivers cannot mess with it and simplifies the code slightly. Reviewed-by: Ricardo B. 
Marliere Link: https://lore.kernel.org/r/20240301-psy-class-cleanup-v1-1-aebe8c4b6b08@collabora.com Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 514f652de64d..92dd205774ec 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -894,8 +894,7 @@ extern int power_supply_powers(struct power_supply *psy, struct device *dev); #define to_power_supply(device) container_of(device, struct power_supply, dev) extern void *power_supply_get_drvdata(struct power_supply *psy); -/* For APM emulation, think legacy userspace. */ -extern const struct class power_supply_class; +extern int power_supply_for_each_device(void *data, int (*fn)(struct device *dev, void *data)); static inline bool power_supply_is_amp_property(enum power_supply_property psp) { -- cgit v1.2.3 From 4e61f1e9d58fb0765f59f47d4d1f318b36c14d95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sun, 3 Mar 2024 16:31:15 +0100 Subject: power: supply: core: fix charge_behaviour formatting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This property is documented to have a special format which exposes all available behaviours and the currently active one at the same time. For this special format some helpers are provided. When the charge_behaviour property was added in 1b0b6cc8030d ("power: supply: add charge_behaviour attributes"), it did not update the default logic in power_supply_sysfs.c to use the format helpers. Thus by default only the currently active behaviour is printed. This fixes the default logic to follow the documented format. There is currently only one in-tree driver exposing charge behaviours - thinkpad_acpi, which is not affected by the change, as it directly uses the helpers and does not use the power_supply_sysfs.c logic. 
Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20240303-power_supply-charge_behaviour_prop-v2-3-8ebb0a7c2409@weissschuh.net Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 92dd205774ec..8e5705a56b85 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -242,6 +242,7 @@ struct power_supply_config { struct power_supply_desc { const char *name; enum power_supply_type type; + u8 charge_behaviours; const enum power_supply_usb_type *usb_types; size_t num_usb_types; const enum power_supply_property *properties; -- cgit v1.2.3 From 289e922582af5b4721ba02e86bde4d9ba918158a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 4 Mar 2024 17:35:32 -0800 Subject: dpll: move all dpll<>netdev helpers to dpll code Older versions of GCC really want to know the full definition of the type involved in rcu_assign_pointer(). struct dpll_pin is defined in a local header, net/core can't reach it. Move all the netdev <> dpll code into dpll, where the type is known. Otherwise we'd need multiple function calls to jump between the compilation units. This is the same problem the commit under fixes was trying to address, but with rcu_assign_pointer() not rcu_dereference(). Some of the exports are not needed, networking core can't be a module, we only need exports for the helpers used by drivers. 
Reported-by: Geert Uytterhoeven Link: https://lore.kernel.org/all/35a869c8-52e8-177-1d4d-e57578b99b6@linux-m68k.org/ Fixes: 640f41ed33b5 ("dpll: fix build failure due to rcu_dereference_check() on unknown type") Reviewed-by: Jiri Pirko Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240305013532.694866-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/dpll.h | 26 +++++++++++++------------- include/linux/netdevice.h | 4 ---- 2 files changed, 13 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dpll.h b/include/linux/dpll.h index c60591308ae8..e37344f6a231 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -122,15 +122,24 @@ struct dpll_pin_properties { }; #if IS_ENABLED(CONFIG_DPLL) -size_t dpll_msg_pin_handle_size(struct dpll_pin *pin); -int dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin); +void dpll_netdev_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin); +void dpll_netdev_pin_clear(struct net_device *dev); + +size_t dpll_netdev_pin_handle_size(const struct net_device *dev); +int dpll_netdev_add_pin_handle(struct sk_buff *msg, + const struct net_device *dev); #else -static inline size_t dpll_msg_pin_handle_size(struct dpll_pin *pin) +static inline void +dpll_netdev_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin) { } +static inline void dpll_netdev_pin_clear(struct net_device *dev) { } + +static inline size_t dpll_netdev_pin_handle_size(const struct net_device *dev) { return 0; } -static inline int dpll_msg_add_pin_handle(struct sk_buff *msg, struct dpll_pin *pin) +static inline int +dpll_netdev_add_pin_handle(struct sk_buff *msg, const struct net_device *dev) { return 0; } @@ -169,13 +178,4 @@ int dpll_device_change_ntf(struct dpll_device *dpll); int dpll_pin_change_ntf(struct dpll_pin *pin); -#if !IS_ENABLED(CONFIG_DPLL) -static inline struct dpll_pin *netdev_dpll_pin(const struct net_device *dev) -{ - return NULL; -} -#else -struct dpll_pin 
*netdev_dpll_pin(const struct net_device *dev); -#endif - #endif diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 735a9386fcf8..78a09af89e39 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -79,8 +79,6 @@ struct xdp_buff; struct xdp_frame; struct xdp_metadata_ops; struct xdp_md; -/* DPLL specific */ -struct dpll_pin; typedef u32 xdp_features_t; @@ -4042,8 +4040,6 @@ int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name); int dev_get_port_parent_id(struct net_device *dev, struct netdev_phys_item_id *ppid, bool recurse); bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b); -void netdev_dpll_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin); -void netdev_dpll_pin_clear(struct net_device *dev); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, -- cgit v1.2.3 From e3b6876ab85061e7de198f023a0c2bfc7478b420 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 2 Mar 2024 20:53:01 +0100 Subject: net: phy: Add phydev->enable_tx_lpi to simplify adjust link callbacks MAC drivers which support EEE need to know the results of the EEE auto-neg in order to program the hardware to perform EEE or not. The oddly named phy_init_eee() can be used to determine this, it returns 0 if EEE should be used, or a negative error code, e.g. -EOPPROTONOTSUPPORT if the PHY does not support EEE or negotiate resulted in it not being used. However, many MAC drivers get this wrong. Add phydev->enable_tx_lpi which indicates the result of the autoneg for EEE, including if EEE is administratively disabled with ethtool. The MAC driver can then access this in the same way as link speed and duplex in the adjust link callback. If enable_tx_lpi is true, the MAC should send low power indications and does not need to consider anything else with respect to EEE. 
Reviewed-by: Florian Fainelli Signed-off-by: Andrew Lunn Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/r/20240302195306.3207716-3-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index e3ab2c347a59..a880f6d7170e 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -594,6 +594,7 @@ struct macsec_ops; * @supported_eee: supported PHY EEE linkmodes * @advertising_eee: Currently advertised EEE linkmodes * @eee_enabled: Flag indicating whether the EEE feature is enabled + * @enable_tx_lpi: When True, MAC should transmit LPI to PHY * @lp_advertising: Current link partner advertised linkmodes * @host_interfaces: PHY interface modes supported by host * @eee_broken_modes: Energy efficient ethernet modes which should be prohibited @@ -713,6 +714,7 @@ struct phy_device { /* Energy efficient ethernet modes which should be prohibited */ u32 eee_broken_modes; + bool enable_tx_lpi; #ifdef CONFIG_LED_TRIGGER_PHY struct phy_led_trigger *phy_led_triggers; -- cgit v1.2.3 From fe0d4fd9285e5013b4bafbd3338847235b805a1c Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 2 Mar 2024 20:53:02 +0100 Subject: net: phy: Keep track of EEE configuration Have phylib keep track of the EEE configuration. This simplifies the MAC drivers, in that they don't need to store it. Future patches to phylib will also make use of this information to further simplify the MAC drivers. 
Reviewed-by: Russell King (Oracle) Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/r/20240302195306.3207716-4-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index a880f6d7170e..695e366bd75c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -30,6 +30,7 @@ #include #include +#include #define PHY_DEFAULT_FEATURES (SUPPORTED_Autoneg | \ SUPPORTED_TP | \ @@ -595,6 +596,7 @@ struct macsec_ops; * @advertising_eee: Currently advertised EEE linkmodes * @eee_enabled: Flag indicating whether the EEE feature is enabled * @enable_tx_lpi: When True, MAC should transmit LPI to PHY + * @eee_cfg: User configuration of EEE * @lp_advertising: Current link partner advertised linkmodes * @host_interfaces: PHY interface modes supported by host * @eee_broken_modes: Energy efficient ethernet modes which should be prohibited @@ -715,6 +717,7 @@ struct phy_device { /* Energy efficient ethernet modes which should be prohibited */ u32 eee_broken_modes; bool enable_tx_lpi; + struct eee_config eee_cfg; #ifdef CONFIG_LED_TRIGGER_PHY struct phy_led_trigger *phy_led_triggers; -- cgit v1.2.3 From 49168d1980e220cf3d1b761e1eafac62041cb94d Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 2 Mar 2024 20:53:04 +0100 Subject: net: phy: Add phy_support_eee() indicating MAC support EEE In order for EEE to operate, both the MAC and the PHY need to support it, similar to how pause works. With some exception - a number of PHYs have SmartEEE or AutoGrEEEn support in order to provide some EEE-like power savings with non-EEE capable MACs. Copy the pause concept and add the call phy_support_eee() which the MAC makes after connecting the PHY to indicate it supports EEE. phylib will then advertise EEE when auto-neg is performed. 
Signed-off-by: Andrew Lunn Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/r/20240302195306.3207716-6-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 695e366bd75c..3f68b8239bb1 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -706,7 +706,7 @@ struct phy_device { __ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertising); /* used with phy_speed_down */ __ETHTOOL_DECLARE_LINK_MODE_MASK(adv_old); - /* used for eee validation */ + /* used for eee validation and configuration*/ __ETHTOOL_DECLARE_LINK_MODE_MASK(supported_eee); __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising_eee); bool eee_enabled; @@ -1973,6 +1973,7 @@ void phy_advertise_supported(struct phy_device *phydev); void phy_advertise_eee_all(struct phy_device *phydev); void phy_support_sym_pause(struct phy_device *phydev); void phy_support_asym_pause(struct phy_device *phydev); +void phy_support_eee(struct phy_device *phydev); void phy_set_sym_pause(struct phy_device *phydev, bool rx, bool tx, bool autoneg); void phy_set_asym_pause(struct phy_device *phydev, bool rx, bool tx); -- cgit v1.2.3 From ad86f7e959dc1814c3b2bcbeb08c3c02214110a7 Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Thu, 22 Feb 2024 14:56:59 +0100 Subject: firmware: arm_scmi: Populate perf commands rate_limit Arm SCMI spec. v3.2, s4.5.3.4 PERFORMANCE_DOMAIN_ATTRIBUTES defines a per-domain rate_limit for performance requests: """ Rate Limit in microseconds, indicating the minimum time required between successive requests. A value of 0 indicates that this field is not supported by the platform. This field does not apply to FastChannels. """" The field is first defined in SCMI v1.0. Add support to fetch this value and advertise it through a rate_limit_get() callback. 
Signed-off-by: Pierre Gondois Reviewed-by: Cristian Marussi Acked-by: Sudeep Holla Signed-off-by: Viresh Kumar --- include/linux/scmi_protocol.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index f2f05fb42d28..acd956ffcb84 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -128,6 +128,8 @@ struct scmi_perf_domain_info { * @level_set: sets the performance level of a domain * @level_get: gets the performance level of a domain * @transition_latency_get: gets the DVFS transition latency for a given device + * @rate_limit_get: gets the minimum time (us) required between successive + * requests * @device_opps_add: adds all the OPPs for a given device * @freq_set: sets the frequency for a given device using sustained frequency * to sustained performance level mapping @@ -154,6 +156,8 @@ struct scmi_perf_proto_ops { u32 *level, bool poll); int (*transition_latency_get)(const struct scmi_protocol_handle *ph, u32 domain); + int (*rate_limit_get)(const struct scmi_protocol_handle *ph, + u32 domain, u32 *rate_limit); int (*device_opps_add)(const struct scmi_protocol_handle *ph, struct device *dev, u32 domain); int (*freq_set)(const struct scmi_protocol_handle *ph, u32 domain, -- cgit v1.2.3 From 2441caa84aac8abf1be9e20db3e6bb921e74c8a2 Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Thu, 22 Feb 2024 14:57:00 +0100 Subject: firmware: arm_scmi: Populate fast channel rate_limit Arm SCMI spec. v3.2, s4.5.3.12 PERFORMANCE_DESCRIBE_FASTCHANNEL defines a per-domain rate_limit for performance requests: """ Rate Limit in microseconds, indicating the minimum time required between successive requests. A value of 0 indicates that this field is not applicable or supported on the platform. """" The field is first defined in SCMI v2.0. Add support to fetch this value and advertise it through a fast_switch_rate_limit() callback. 
Signed-off-by: Pierre Gondois Reviewed-by: Cristian Marussi Acked-by: Sudeep Holla Signed-off-by: Viresh Kumar --- include/linux/scmi_protocol.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index acd956ffcb84..fafedb3b6604 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -139,6 +139,8 @@ struct scmi_perf_domain_info { * at a given frequency * @fast_switch_possible: indicates if fast DVFS switching is possible or not * for a given device + * @fast_switch_rate_limit: gets the minimum time (us) required between + * successive fast_switching requests * @power_scale_mw_get: indicates if the power values provided are in milliWatts * or in some other (abstract) scale */ @@ -168,6 +170,8 @@ struct scmi_perf_proto_ops { unsigned long *rate, unsigned long *power); bool (*fast_switch_possible)(const struct scmi_protocol_handle *ph, u32 domain); + int (*fast_switch_rate_limit)(const struct scmi_protocol_handle *ph, + u32 domain, u32 *rate_limit); enum scmi_power_scale (*power_scale_get)(const struct scmi_protocol_handle *ph); }; -- cgit v1.2.3 From a50026bdb867c8caf9d29e18f9fe9e1390312619 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 5 Mar 2024 21:33:36 +0800 Subject: iov_iter: get rid of 'copy_mc' flag This flag is only set by one single user: the magical core dumping code that looks up user pages one by one, and then writes them out using their kernel addresses (by using a BVEC_ITER). That actually ends up being a huge problem, because while we do use copy_mc_to_kernel() for this case and it is able to handle the possible machine checks involved, nothing else is really ready to handle the failures caused by the machine check. In particular, as reported by Tong Tiangen, we don't actually support fault_in_iov_iter_readable() on a machine check area. 
As a result, the usual logic for writing things to a file under a filesystem lock, which involves doing a copy with page faults disabled and then if that fails trying to fault pages in without holding the locks with fault_in_iov_iter_readable() does not work at all. We could decide to always just make the MC copy "succeed" (and filling the destination with zeroes), and that would then create a core dump file that just ignores any machine checks. But honestly, this single special case has been problematic before, and means that all the normal iov_iter code ends up slightly more complex and slower. See for example commit c9eec08bac96 ("iov_iter: Don't deal with iter->copy_mc in memcpy_from_iter_mc()") where David Howells re-organized the code just to avoid having to check the 'copy_mc' flags inside the inner iov_iter loops. So considering that we have exactly one user, and that one user is a non-critical special case that doesn't actually ever trigger in real life (Tong found this with manual error injection), the sane solution is to just decide that the onus on handling the machine check lies on that user instead. Ergo, do the copy_mc_to_kernel() in the core dump logic itself, copying the user data to a stable kernel page before writing it out. 
Fixes: f1982740f5e7 ("iov_iter: Convert iterate*() to inline funcs") Signed-off-by: Linus Torvalds Signed-off-by: Tong Tiangen Link: https://lore.kernel.org/r/20240305133336.3804360-1-tongtiangen@huawei.com Link: https://lore.kernel.org/all/4e80924d-9c85-f13a-722a-6a5d2b1c225a@huawei.com/ Tested-by: David Howells Reviewed-by: David Howells Reviewed-by: Jens Axboe Reported-by: Tong Tiangen Signed-off-by: Christian Brauner --- include/linux/uio.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index bea9c89922d9..00cebe2b70de 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -40,7 +40,6 @@ struct iov_iter_state { struct iov_iter { u8 iter_type; - bool copy_mc; bool nofault; bool data_source; size_t iov_offset; @@ -248,22 +247,8 @@ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i); #ifdef CONFIG_ARCH_HAS_COPY_MC size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i); -static inline void iov_iter_set_copy_mc(struct iov_iter *i) -{ - i->copy_mc = true; -} - -static inline bool iov_iter_is_copy_mc(const struct iov_iter *i) -{ - return i->copy_mc; -} #else #define _copy_mc_to_iter _copy_to_iter -static inline void iov_iter_set_copy_mc(struct iov_iter *i) { } -static inline bool iov_iter_is_copy_mc(const struct iov_iter *i) -{ - return false; -} #endif size_t iov_iter_zero(size_t bytes, struct iov_iter *); @@ -355,7 +340,6 @@ static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction, WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter) { .iter_type = ITER_UBUF, - .copy_mc = false, .data_source = direction, .ubuf = buf, .count = count, -- cgit v1.2.3 From 9fe0c03f0bfc5f74dad6e818090ab967d8603095 Mon Sep 17 00:00:00 2001 From: Vicki Pfau Date: Tue, 5 Mar 2024 18:08:27 -0800 Subject: fsnotify: Fix misspelling of "writable" Several file system notification system headers have "writable" misspelled as 
"writtable" in the comments. This patch fixes it in the fsnotify header. Signed-off-by: Vicki Pfau Signed-off-by: Jan Kara Message-Id: <20240306020831.1404033-2-vi@endrift.com> --- include/linux/fsnotify_backend.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 7f63be5ca0f1..8f40c349b228 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -31,8 +31,8 @@ #define FS_ACCESS 0x00000001 /* File was accessed */ #define FS_MODIFY 0x00000002 /* File was modified */ #define FS_ATTRIB 0x00000004 /* Metadata changed */ -#define FS_CLOSE_WRITE 0x00000008 /* Writtable file was closed */ -#define FS_CLOSE_NOWRITE 0x00000010 /* Unwrittable file closed */ +#define FS_CLOSE_WRITE 0x00000008 /* Writable file was closed */ +#define FS_CLOSE_NOWRITE 0x00000010 /* Unwritable file closed */ #define FS_OPEN 0x00000020 /* File was opened */ #define FS_MOVED_FROM 0x00000040 /* File was moved from X */ #define FS_MOVED_TO 0x00000080 /* File was moved to Y */ -- cgit v1.2.3 From 00b9850e7307ff690639dd0584a50dd8a72d3548 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 5 Jan 2024 11:16:17 +0100 Subject: greybus: make greybus_bus_type const Now that the driver core can properly handle constant struct bus_type, move the greybus_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. 
Cc: Alex Elder Cc: greybus-dev@lists.linaro.org Reviewed-by: Johan Hovold Link: https://lore.kernel.org/r/2024010517-handgun-scoreless-05e7@gregkh Signed-off-by: Greg Kroah-Hartman --- include/linux/greybus.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/greybus.h b/include/linux/greybus.h index 18c0fb958b74..92da9ec4f5f0 100644 --- a/include/linux/greybus.h +++ b/include/linux/greybus.h @@ -104,7 +104,7 @@ void gb_debugfs_init(void); void gb_debugfs_cleanup(void); struct dentry *gb_debugfs_get(void); -extern struct bus_type greybus_bus_type; +extern const struct bus_type greybus_bus_type; extern struct device_type greybus_hd_type; extern struct device_type greybus_module_type; -- cgit v1.2.3 From e869b72b33731063b50433eb6146d51a479995a1 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Mon, 19 Feb 2024 09:40:50 -0300 Subject: greybus: constify the struct device_type usage Since commit aed65af1cc2f ("drivers: make device_type const"), the driver core can properly handle constant struct device_type. Move the greybus_hd_type, greybus_module_type, greybus_interface_type, greybus_control_type, greybus_bundle_type and greybus_svc_type variables to be constant structures as well, placing it into read-only memory which can not be modified at runtime. Signed-off-by: "Ricardo B. 
Marliere" Reviewed-by: Alex Elder Link: https://lore.kernel.org/r/20240219-device_cleanup-greybus-v1-1-babb3f65e8cc@marliere.net Signed-off-by: Greg Kroah-Hartman --- include/linux/greybus.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/greybus.h b/include/linux/greybus.h index 92da9ec4f5f0..2cc570ea63bf 100644 --- a/include/linux/greybus.h +++ b/include/linux/greybus.h @@ -106,12 +106,12 @@ struct dentry *gb_debugfs_get(void); extern const struct bus_type greybus_bus_type; -extern struct device_type greybus_hd_type; -extern struct device_type greybus_module_type; -extern struct device_type greybus_interface_type; -extern struct device_type greybus_control_type; -extern struct device_type greybus_bundle_type; -extern struct device_type greybus_svc_type; +extern const struct device_type greybus_hd_type; +extern const struct device_type greybus_module_type; +extern const struct device_type greybus_interface_type; +extern const struct device_type greybus_control_type; +extern const struct device_type greybus_bundle_type; +extern const struct device_type greybus_svc_type; static inline int is_gb_host_device(const struct device *dev) { -- cgit v1.2.3 From f8c7511db009d42e2c24e48eeb04e3f1b67ab209 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Tue, 5 Mar 2024 16:32:16 -0300 Subject: block: make block_class constant Since commit 43a7206b0963 ("driver core: class: make class_register() take a const *"), the driver core allows for struct class to be in read-only memory, so move the block_class structure to be declared at build time placing it into read-only memory, instead of having to be dynamically allocated at boot time. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B. 
Marliere Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240305-class_cleanup-block-v1-1-130bb27b9c72@marliere.net Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 285e82723d64..19c7596f4ebf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -42,7 +42,7 @@ struct blk_crypto_profile; extern const struct device_type disk_type; extern const struct device_type part_type; -extern struct class block_class; +extern const struct class block_class; /* * Maximum number of blkcg policies allowed to be registered concurrently. -- cgit v1.2.3 From 39714fd73c6b60a8d27bcc5b431afb0828bf4434 Mon Sep 17 00:00:00 2001 From: Ethan Zhao Date: Tue, 5 Mar 2024 20:21:14 +0800 Subject: PCI: Make pci_dev_is_disconnected() helper public for other drivers Make pci_dev_is_disconnected() public so that it can be called from Intel VT-d driver to quickly fix/workaround the surprise removal unplug hang issue for those ATS capable devices on PCIe switch downstream hotplug capable ports. Beside pci_device_is_present() function, this one has no config space access, so it is light enough to optimize the normal pure surprise removal and safe removal flow. 
Acked-by: Bjorn Helgaas Reviewed-by: Dan Carpenter Tested-by: Haorong Ye Signed-off-by: Ethan Zhao Link: https://lore.kernel.org/r/20240301080727.3529832-2-haifeng.zhao@linux.intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- include/linux/pci.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 7ab0d13672da..213109d3c601 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2517,6 +2517,11 @@ static inline struct pci_dev *pcie_find_root_port(struct pci_dev *dev) return NULL; } +static inline bool pci_dev_is_disconnected(const struct pci_dev *dev) +{ + return dev->error_state == pci_channel_io_perm_failure; +} + void pci_request_acs(void); bool pci_acs_enabled(struct pci_dev *pdev, u16 acs_flags); bool pci_acs_path_enabled(struct pci_dev *start, -- cgit v1.2.3 From 0061ffe289e19caabeea8103e69cb0f1896e34d8 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 5 Mar 2024 20:21:17 +0800 Subject: iommu: Add static iommu_ops->release_domain The current device_release callback for individual iommu drivers does the following: 1) Silent IOMMU DMA translation: It detaches any existing domain from the device and puts it into a blocking state (some drivers might use the identity state). 2) Resource release: It releases resources allocated during the device_probe callback and restores the device to its pre-probe state. Step 1 is challenging for individual iommu drivers because each must check if a domain is already attached to the device. Additionally, if a deferred attach never occurred, the device_release should avoid modifying hardware configuration regardless of the reason for its call. To simplify this process, introduce a static release_domain within the iommu_ops structure. It can be either a blocking or identity domain depending on the iommu hardware. 
The iommu core will decide whether to attach this domain before the device_release callback, eliminating the need for repetitive code in various drivers. Consequently, the device_release callback can focus solely on the opposite operations of device_probe, including releasing all resources allocated during that callback. Co-developed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240305013305.204605-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 1ea2a820e1eb..a0a07e1680a2 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -487,6 +487,7 @@ struct iommu_ops { struct module *owner; struct iommu_domain *identity_domain; struct iommu_domain *blocked_domain; + struct iommu_domain *release_domain; struct iommu_domain *default_domain; }; -- cgit v1.2.3 From dd27a84b06aa9ea6a94b0f3e59dc768f981962e1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 3 Mar 2024 07:01:50 -0700 Subject: block: remove disk_stack_limits disk_stack_limits is unused now, remove it. 
Signed-off-by: Christoph Hellwig Reviewed-by: Song Liu Tested-by: Song Liu Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240303140150.5435-12-hch@lst.de --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 285e82723d64..75c909865a8b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -926,8 +926,6 @@ extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset); void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, sector_t offset, const char *pfx); -extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, - sector_t offset); extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); extern void blk_queue_segment_boundary(struct request_queue *, unsigned long); extern void blk_queue_virt_boundary(struct request_queue *, unsigned long); -- cgit v1.2.3 From 6f42249fecb94dfb6514ed241475f748c03d62fb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Mon, 4 Mar 2024 19:13:42 -0500 Subject: tracing: Limit trace_seq size to just 8K and not depend on architecture PAGE_SIZE The trace_seq buffer is used to print out entire events. It's typically set to PAGE_SIZE * 2 as there are some events that can be quite large. As a side effect, writes to trace_marker are limited by both the size of the trace_seq buffer as well as the ring buffer's sub-buffer size (which is a power of PAGE_SIZE). By limiting the trace_seq size, it also limits the size of the largest string written to trace_marker. trace_seq does not need to be dependent on PAGE_SIZE like the ring buffer sub-buffers need to be. Hard code it to 8K which is PAGE_SIZE * 2 on most architectures. This will also limit the size of trace_marker on those architectures with greater than 4K PAGE_SIZE. 
Link: https://lore.kernel.org/all/20240302111244.3a1674be@gandalf.local.home/ Link: https://lore.kernel.org/linux-trace-kernel/20240304191342.56fb1087@gandalf.local.home Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Linus Torvalds Cc: Sachin Sant Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_seq.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 9ec229dfddaa..1ef95c0287f0 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -9,9 +9,15 @@ /* * Trace sequences are used to allow a function to call several other functions * to create a string of data to use. + * + * Have the trace seq to be 8K which is typically PAGE_SIZE * 2 on + * most architectures. The TRACE_SEQ_BUFFER_SIZE (which is + * TRACE_SEQ_SIZE minus the other fields of trace_seq), is the + * max size the output of a trace event may be. */ -#define TRACE_SEQ_BUFFER_SIZE (PAGE_SIZE * 2 - \ +#define TRACE_SEQ_SIZE 8192 +#define TRACE_SEQ_BUFFER_SIZE (TRACE_SEQ_SIZE - \ (sizeof(struct seq_buf) + sizeof(size_t) + sizeof(int))) struct trace_seq { -- cgit v1.2.3 From eb52286634f042432ec775077a73334603a1c6e4 Mon Sep 17 00:00:00 2001 From: "Gang Li Subject: padata: dispatch works on" Date: Wed, 6 Mar 2024 13:04:17 -0800 Subject: Author: Gang Li padata: dispatch works on different nodes Date: Thu, 22 Feb 2024 22:04:17 +0800 When a group of tasks that access different nodes are scheduled on the same node, they may encounter bandwidth bottlenecks and access latency. Thus, numa_aware flag is introduced here, allowing tasks to be distributed across different nodes to fully utilize the advantage of multi-node systems. 
Link: https://lkml.kernel.org/r/20240222140422.393911-5-gang.li@linux.dev Signed-off-by: Gang Li Tested-by: David Rientjes Reviewed-by: Muchun Song Reviewed-by: Tim Chen Cc: Alexey Dobriyan Cc: Daniel Jordan Cc: David Hildenbrand Cc: Jane Chu Cc: Mike Kravetz Cc: Paul E. McKenney Cc: Randy Dunlap Cc: Steffen Klassert Signed-off-by: Andrew Morton --- include/linux/padata.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/padata.h b/include/linux/padata.h index 495b16b6b4d7..8f418711351b 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -137,6 +137,7 @@ struct padata_shell { * appropriate for one worker thread to do at once. * @max_threads: Max threads to use for the job, actual number may be less * depending on task size and minimum chunk size. + * @numa_aware: Distribute jobs to different nodes with CPU in a round robin fashion. */ struct padata_mt_job { void (*thread_fn)(unsigned long start, unsigned long end, void *arg); @@ -146,6 +147,7 @@ struct padata_mt_job { unsigned long align; unsigned long min_chunk; int max_threads; + bool numa_aware; }; /** -- cgit v1.2.3 From bd5ed02e23958cb56d0f5a90ebe620c8b47dab47 Mon Sep 17 00:00:00 2001 From: Gang Li Date: Thu, 22 Feb 2024 22:04:18 +0800 Subject: padata: downgrade padata_do_multithreaded to serial execution for non-SMP hugetlb parallelization depends on PADATA, and PADATA depends on SMP. PADATA consists of two distinct pieces of functionality: One part is padata_do_multithreaded which disregards order and simply divides tasks into several groups for parallel execution. Hugetlb init parallelization depends on padata_do_multithreaded. The other part is composed of a set of APIs that, while handling data in an out-of-order parallel manner, can eventually return the data in an ordered sequence. Currently only `crypto/pcrypt.c` uses them. All users of PADATA in the non-SMP case currently only use padata_do_multithreaded. 
It is easy to implement a serial one in include/linux/padata.h. And it is not necessary to implement another functionality unless the only user of crypto/pcrypt.c does not depend on SMP in the future. Link: https://lkml.kernel.org/r/20240222140422.393911-6-gang.li@linux.dev Signed-off-by: Gang Li Tested-by: Paul E. McKenney Acked-by: Daniel Jordan Cc: David Hildenbrand Cc: David Rientjes Cc: Jane Chu Cc: Muchun Song Cc: Randy Dunlap Cc: Steffen Klassert Cc: Tim Chen Cc: Alexey Dobriyan Cc: Mike Kravetz Signed-off-by: Andrew Morton --- include/linux/padata.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/padata.h b/include/linux/padata.h index 8f418711351b..0146daf34430 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -180,10 +180,6 @@ struct padata_instance { #ifdef CONFIG_PADATA extern void __init padata_init(void); -#else -static inline void __init padata_init(void) {} -#endif - extern struct padata_instance *padata_alloc(const char *name); extern void padata_free(struct padata_instance *pinst); extern struct padata_shell *padata_alloc_shell(struct padata_instance *pinst); @@ -194,4 +190,12 @@ extern void padata_do_serial(struct padata_priv *padata); extern void __init padata_do_multithreaded(struct padata_mt_job *job); extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, cpumask_var_t cpumask); +#else +static inline void __init padata_init(void) {} +static inline void __init padata_do_multithreaded(struct padata_mt_job *job) +{ + job->thread_fn(job->start, job->start + job->size, job->fn_arg); +} +#endif + #endif -- cgit v1.2.3 From b78b27d02930f6f0262353080d0f784ce7aa377e Mon Sep 17 00:00:00 2001 From: Gang Li Date: Thu, 22 Feb 2024 22:04:21 +0800 Subject: hugetlb: parallelize 1G hugetlb initialization Optimizing the initialization speed of 1G huge pages through parallelization. 
1G hugetlbs are allocated from bootmem, a process that is already very fast and does not currently require optimization. Therefore, we focus on parallelizing only the initialization phase in `gather_bootmem_prealloc`. Here are some test results: test case no patch(ms) patched(ms) saved ------------------- -------------- ------------- -------- 256c2T(4 node) 1G 4745 2024 57.34% 128c1T(2 node) 1G 3358 1712 49.02% 12T 1G 77000 18300 76.23% [akpm@linux-foundation.org: s/initialied/initialized/, per Alexey] Link: https://lkml.kernel.org/r/20240222140422.393911-9-gang.li@linux.dev Signed-off-by: Gang Li Tested-by: David Rientjes Reviewed-by: Muchun Song Cc: Alexey Dobriyan Cc: Daniel Jordan Cc: David Hildenbrand Cc: Jane Chu Cc: Mike Kravetz Cc: Paul E. McKenney Cc: Randy Dunlap Cc: Steffen Klassert Cc: Tim Chen Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c1ee640d87b1..77b30a8c6076 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -178,7 +178,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage); extern int sysctl_hugetlb_shm_group; -extern struct list_head huge_boot_pages; +extern struct list_head huge_boot_pages[MAX_NUMNODES]; /* arch callbacks */ -- cgit v1.2.3 From dfbac6dc68bae989bd68a56947dcca16c5574fda Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 19:23:28 +0000 Subject: mm: separate out FOLIO_FLAGS from PAGEFLAGS Patch series "PageFlags cleanups". We have now successfully removed all of the uses of some of the PageFlags from the kernel, but there's nothing to stop somebody reintroducing them. By splitting out FOLIO_FLAGS from PAGEFLAGS, we can stop defining the old flags; and we do that in some of the later patches. 
After doing this, I realised that dump_page() was living dangerously; we could end up calling folio_test_foo() on a pointer which no longer pointed to a folio (as dump_page() is not necessarily called when the caller has a reference to the page). So I fixed that up. And then I realised that this was the key to making dump_page() take a const argument, which means we can constify the page flags testing, which means we can remove more cast-away-the-const bad code. And here's where I ended up. This patch (of 8): We've progressed far enough with the folio transition that some flags are now no longer checked on pages, but only on folios. To prevent new users appearing, prepare to only define the folio versions of the flag test/set/clear. Link: https://lkml.kernel.org/r/20240227192337.757313-1-willy@infradead.org Link: https://lkml.kernel.org/r/20240227192337.757313-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 63 +++++++++++++++++++++++++++++++--------------- 1 file changed, 43 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 735cddc13d20..95ab75d0b39c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -367,54 +367,77 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n) #define FOLIO_PF_NO_COMPOUND 0 #define FOLIO_PF_SECOND 1 +#define FOLIO_HEAD_PAGE 0 +#define FOLIO_SECOND_PAGE 1 + /* * Macros to create function definitions for page flags */ +#define FOLIO_TEST_FLAG(name, page) \ +static __always_inline bool folio_test_##name(struct folio *folio) \ +{ return test_bit(PG_##name, folio_flags(folio, page)); } + +#define FOLIO_SET_FLAG(name, page) \ +static __always_inline void folio_set_##name(struct folio *folio) \ +{ set_bit(PG_##name, folio_flags(folio, page)); } + +#define FOLIO_CLEAR_FLAG(name, page) \ +static __always_inline void 
folio_clear_##name(struct folio *folio) \ +{ clear_bit(PG_##name, folio_flags(folio, page)); } + +#define __FOLIO_SET_FLAG(name, page) \ +static __always_inline void __folio_set_##name(struct folio *folio) \ +{ __set_bit(PG_##name, folio_flags(folio, page)); } + +#define __FOLIO_CLEAR_FLAG(name, page) \ +static __always_inline void __folio_clear_##name(struct folio *folio) \ +{ __clear_bit(PG_##name, folio_flags(folio, page)); } + +#define FOLIO_TEST_SET_FLAG(name, page) \ +static __always_inline bool folio_test_set_##name(struct folio *folio) \ +{ return test_and_set_bit(PG_##name, folio_flags(folio, page)); } + +#define FOLIO_TEST_CLEAR_FLAG(name, page) \ +static __always_inline bool folio_test_clear_##name(struct folio *folio) \ +{ return test_and_clear_bit(PG_##name, folio_flags(folio, page)); } + +#define FOLIO_FLAG(name, page) \ +FOLIO_TEST_FLAG(name, page) \ +FOLIO_SET_FLAG(name, page) \ +FOLIO_CLEAR_FLAG(name, page) + #define TESTPAGEFLAG(uname, lname, policy) \ -static __always_inline bool folio_test_##lname(struct folio *folio) \ -{ return test_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ +FOLIO_TEST_FLAG(lname, FOLIO_##policy) \ static __always_inline int Page##uname(struct page *page) \ { return test_bit(PG_##lname, &policy(page, 0)->flags); } #define SETPAGEFLAG(uname, lname, policy) \ -static __always_inline \ -void folio_set_##lname(struct folio *folio) \ -{ set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ +FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline void SetPage##uname(struct page *page) \ { set_bit(PG_##lname, &policy(page, 1)->flags); } #define CLEARPAGEFLAG(uname, lname, policy) \ -static __always_inline \ -void folio_clear_##lname(struct folio *folio) \ -{ clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ +FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void ClearPage##uname(struct page *page) \ { clear_bit(PG_##lname, &policy(page, 1)->flags); } #define 
__SETPAGEFLAG(uname, lname, policy) \ -static __always_inline \ -void __folio_set_##lname(struct folio *folio) \ -{ __set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ +__FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline void __SetPage##uname(struct page *page) \ { __set_bit(PG_##lname, &policy(page, 1)->flags); } #define __CLEARPAGEFLAG(uname, lname, policy) \ -static __always_inline \ -void __folio_clear_##lname(struct folio *folio) \ -{ __clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ +__FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void __ClearPage##uname(struct page *page) \ { __clear_bit(PG_##lname, &policy(page, 1)->flags); } #define TESTSETFLAG(uname, lname, policy) \ -static __always_inline \ -bool folio_test_set_##lname(struct folio *folio) \ -{ return test_and_set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ +FOLIO_TEST_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline int TestSetPage##uname(struct page *page) \ { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); } #define TESTCLEARFLAG(uname, lname, policy) \ -static __always_inline \ -bool folio_test_clear_##lname(struct folio *folio) \ -{ return test_and_clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ +FOLIO_TEST_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline int TestClearPage##uname(struct page *page) \ { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } -- cgit v1.2.3 From 0d846469fd216d37a91845945e9baad11dfa107b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 19:23:29 +0000 Subject: mm: remove PageWaiters, PageSetWaiters and PageClearWaiters All callers have been converted to use folios. This was the only user of PF_ONLY_HEAD, so remove that too. 
Link: https://lkml.kernel.org/r/20240227192337.757313-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 95ab75d0b39c..d8f5127ae72e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -328,9 +328,6 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n) * for compound page all operations related to the page flag applied to * head page. * - * PF_ONLY_HEAD: - * for compound page, callers only ever operate on the head page. - * * PF_NO_TAIL: * modifications of the page flag must be done on small or head pages, * checks can be done on tail pages too. @@ -346,9 +343,6 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n) page; }) #define PF_ANY(page, enforce) PF_POISONED_CHECK(page) #define PF_HEAD(page, enforce) PF_POISONED_CHECK(compound_head(page)) -#define PF_ONLY_HEAD(page, enforce) ({ \ - VM_BUG_ON_PGFLAGS(PageTail(page), page); \ - PF_POISONED_CHECK(page); }) #define PF_NO_TAIL(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \ PF_POISONED_CHECK(compound_head(page)); }) @@ -362,7 +356,6 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n) /* Which page is the flag stored in */ #define FOLIO_PF_ANY 0 #define FOLIO_PF_HEAD 0 -#define FOLIO_PF_ONLY_HEAD 0 #define FOLIO_PF_NO_TAIL 0 #define FOLIO_PF_NO_COMPOUND 0 #define FOLIO_PF_SECOND 1 @@ -488,7 +481,7 @@ static inline int TestClearPage##uname(struct page *page) { return 0; } TESTSETFLAG_FALSE(uname, lname) TESTCLEARFLAG_FALSE(uname, lname) __PAGEFLAG(Locked, locked, PF_NO_TAIL) -PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) +FOLIO_FLAG(waiters, FOLIO_HEAD_PAGE) PAGEFLAG(Error, error, PF_NO_TAIL) TESTCLEARFLAG(Error, error, PF_NO_TAIL) PAGEFLAG(Referenced, 
referenced, PF_HEAD) TESTCLEARFLAG(Referenced, referenced, PF_HEAD) @@ -1138,7 +1131,6 @@ static inline bool folio_has_private(struct folio *folio) #undef PF_ANY #undef PF_HEAD -#undef PF_ONLY_HEAD #undef PF_NO_TAIL #undef PF_NO_COMPOUND #undef PF_SECOND -- cgit v1.2.3 From 7da8988c7c0e28dad8d0e9a697d6e7baa66f4534 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 19:23:30 +0000 Subject: mm: remove PageYoung and PageIdle definitions All callers have been converted to use folios, so remove the various set/clear/test functions defined on pages. Link: https://lkml.kernel.org/r/20240227192337.757313-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d8f5127ae72e..582ca7400eca 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -599,10 +599,10 @@ PAGEFLAG_FALSE(HWPoison, hwpoison) #endif #if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) -TESTPAGEFLAG(Young, young, PF_ANY) -SETPAGEFLAG(Young, young, PF_ANY) -TESTCLEARFLAG(Young, young, PF_ANY) -PAGEFLAG(Idle, idle, PF_ANY) +FOLIO_TEST_FLAG(young, FOLIO_HEAD_PAGE) +FOLIO_SET_FLAG(young, FOLIO_HEAD_PAGE) +FOLIO_TEST_CLEAR_FLAG(young, FOLIO_HEAD_PAGE) +FOLIO_FLAG(idle, FOLIO_HEAD_PAGE) #endif /* -- cgit v1.2.3 From fae7d834c43ccdb9fcecaf4d0f33145d884b3e5c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 19:23:31 +0000 Subject: mm: add __dump_folio() Turn __dump_page() into a wrapper around __dump_folio(). Snapshot the page & folio into a stack variable so we don't hit BUG_ON() if an allocation is freed under us and what was a folio pointer becomes a pointer to a tail page. 
[willy@infradead.org: fix build issue] Link: https://lkml.kernel.org/r/ZeAKCyTn_xS3O9cE@casper.infradead.org [willy@infradead.org: fix __dump_folio] Link: https://lkml.kernel.org/r/ZeJJegP8zM7S9GTy@casper.infradead.org [willy@infradead.org: fix pointer confusion] Link: https://lkml.kernel.org/r/ZeYa00ixxC4k1ot-@casper.infradead.org [akpm@linux-foundation.org: s/printk/pr_warn/] Link: https://lkml.kernel.org/r/20240227192337.757313-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 +++++++ include/linux/mmzone.h | 3 +++ 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index d45eadc440f5..02547c8adda0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2066,6 +2066,13 @@ static inline long folio_nr_pages(struct folio *folio) #endif } +/* Only hugetlbfs can allocate folios larger than MAX_ORDER */ +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE +#define MAX_FOLIO_NR_PAGES (1UL << PUD_ORDER) +#else +#define MAX_FOLIO_NR_PAGES MAX_ORDER_NR_PAGES +#endif + /* * compound_nr() returns the number of pages in this potentially compound * page. 
compound_nr() can be called on a tail page, and is defined to diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 633812a1d220..c11b7cde81ef 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -76,9 +76,12 @@ extern const char * const migratetype_names[MIGRATE_TYPES]; #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) # define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA) +# define is_migrate_cma_folio(folio, pfn) (MIGRATE_CMA == \ + get_pfnblock_flags_mask(&folio->page, pfn, MIGRATETYPE_MASK)) #else # define is_migrate_cma(migratetype) false # define is_migrate_cma_page(_page) false +# define is_migrate_cma_folio(folio, pfn) false #endif static inline bool is_migrate_movable(int mt) -- cgit v1.2.3 From b3a3203309c89061452250f7384507787b7badcb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 19:23:32 +0000 Subject: mm: make dump_page() take a const argument Now that __dump_page() takes a const argument, we can make dump_page() take a const struct page too. 
Link: https://lkml.kernel.org/r/20240227192337.757313-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/mmdebug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 7c3e7b0b0e8f..39a7714605a7 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -10,7 +10,7 @@ struct vm_area_struct; struct mm_struct; struct vma_iterator; -void dump_page(struct page *page, const char *reason); +void dump_page(const struct page *page, const char *reason); void dump_vma(const struct vm_area_struct *vma); void dump_mm(const struct mm_struct *mm); void vma_iter_dump_tree(const struct vma_iterator *vmi); -- cgit v1.2.3 From ce3467af6bded1c0018ca67ea1599f45fbb8100b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 19:23:33 +0000 Subject: mm: constify testing page/folio flags Now that dump_page() takes a const argument, we can constify all the page flag tests. 
Link: https://lkml.kernel.org/r/20240227192337.757313-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 582ca7400eca..3463cd1baebf 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -237,7 +237,7 @@ static inline const struct page *page_fixed_fake_head(const struct page *page) } #endif -static __always_inline int page_is_fake_head(struct page *page) +static __always_inline int page_is_fake_head(const struct page *page) { return page_fixed_fake_head(page) != page; } @@ -281,12 +281,12 @@ static inline unsigned long _compound_head(const struct page *page) */ #define folio_page(folio, n) nth_page(&(folio)->page, n) -static __always_inline int PageTail(struct page *page) +static __always_inline int PageTail(const struct page *page) { return READ_ONCE(page->compound_head) & 1 || page_is_fake_head(page); } -static __always_inline int PageCompound(struct page *page) +static __always_inline int PageCompound(const struct page *page) { return test_bit(PG_head, &page->flags) || READ_ONCE(page->compound_head) & 1; @@ -306,6 +306,16 @@ static inline void page_init_poison(struct page *page, size_t size) } #endif +static const unsigned long *const_folio_flags(const struct folio *folio, + unsigned n) +{ + const struct page *page = &folio->page; + + VM_BUG_ON_PGFLAGS(PageTail(page), page); + VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page); + return &page[n].flags; +} + static unsigned long *folio_flags(struct folio *folio, unsigned n) { struct page *page = &folio->page; @@ -367,8 +377,8 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n) * Macros to create function definitions for page flags */ #define FOLIO_TEST_FLAG(name, page) \ -static 
__always_inline bool folio_test_##name(struct folio *folio) \ -{ return test_bit(PG_##name, folio_flags(folio, page)); } +static __always_inline bool folio_test_##name(const struct folio *folio) \ +{ return test_bit(PG_##name, const_folio_flags(folio, page)); } #define FOLIO_SET_FLAG(name, page) \ static __always_inline void folio_set_##name(struct folio *folio) \ @@ -401,7 +411,7 @@ FOLIO_CLEAR_FLAG(name, page) #define TESTPAGEFLAG(uname, lname, policy) \ FOLIO_TEST_FLAG(lname, FOLIO_##policy) \ -static __always_inline int Page##uname(struct page *page) \ +static __always_inline int Page##uname(const struct page *page) \ { return test_bit(PG_##lname, &policy(page, 0)->flags); } #define SETPAGEFLAG(uname, lname, policy) \ @@ -801,7 +811,7 @@ static __always_inline bool folio_test_head(struct folio *folio) return test_bit(PG_head, folio_flags(folio, FOLIO_PF_ANY)); } -static __always_inline int PageHead(struct page *page) +static __always_inline int PageHead(const struct page *page) { PF_POISONED_CHECK(page); return test_bit(PG_head, &page->flags) && !page_is_fake_head(page); -- cgit v1.2.3 From 29cfe7556bfd6be043b6eb602a29c89d43565d71 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 19:23:34 +0000 Subject: mm: constify more page/folio tests Constify the flag tests that aren't automatically generated and the tests that look like flag tests but are more complicated. 
Link: https://lkml.kernel.org/r/20240227192337.757313-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 52 +++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 3463cd1baebf..652d77805e99 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -558,13 +558,13 @@ PAGEFLAG_FALSE(HighMem, highmem) #endif #ifdef CONFIG_SWAP -static __always_inline bool folio_test_swapcache(struct folio *folio) +static __always_inline bool folio_test_swapcache(const struct folio *folio) { return folio_test_swapbacked(folio) && - test_bit(PG_swapcache, folio_flags(folio, 0)); + test_bit(PG_swapcache, const_folio_flags(folio, 0)); } -static __always_inline bool PageSwapCache(struct page *page) +static __always_inline bool PageSwapCache(const struct page *page) { return folio_test_swapcache(page_folio(page)); } @@ -663,22 +663,22 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) */ #define PAGE_MAPPING_DAX_SHARED ((void *)0x1) -static __always_inline bool folio_mapping_flags(struct folio *folio) +static __always_inline bool folio_mapping_flags(const struct folio *folio) { return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) != 0; } -static __always_inline int PageMappingFlags(struct page *page) +static __always_inline int PageMappingFlags(const struct page *page) { return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != 0; } -static __always_inline bool folio_test_anon(struct folio *folio) +static __always_inline bool folio_test_anon(const struct folio *folio) { return ((unsigned long)folio->mapping & PAGE_MAPPING_ANON) != 0; } -static __always_inline bool PageAnon(struct page *page) +static __always_inline bool PageAnon(const struct page *page) { return folio_test_anon(page_folio(page)); } @@ -689,7 
+689,7 @@ static __always_inline bool __folio_test_movable(const struct folio *folio) PAGE_MAPPING_MOVABLE; } -static __always_inline int __PageMovable(struct page *page) +static __always_inline int __PageMovable(const struct page *page) { return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_MOVABLE; @@ -702,13 +702,13 @@ static __always_inline int __PageMovable(struct page *page) * is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any * anon_vma, but to that page's node of the stable tree. */ -static __always_inline bool folio_test_ksm(struct folio *folio) +static __always_inline bool folio_test_ksm(const struct folio *folio) { return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_KSM; } -static __always_inline bool PageKsm(struct page *page) +static __always_inline bool PageKsm(const struct page *page) { return folio_test_ksm(page_folio(page)); } @@ -747,9 +747,9 @@ static inline bool folio_xor_flags_has_waiters(struct folio *folio, * some of the bytes in it may be; see the is_partially_uptodate() * address_space operation. */ -static inline bool folio_test_uptodate(struct folio *folio) +static inline bool folio_test_uptodate(const struct folio *folio) { - bool ret = test_bit(PG_uptodate, folio_flags(folio, 0)); + bool ret = test_bit(PG_uptodate, const_folio_flags(folio, 0)); /* * Must ensure that the data we read out of the folio is loaded * _after_ we've loaded folio->flags to check the uptodate bit. 
@@ -764,7 +764,7 @@ static inline bool folio_test_uptodate(struct folio *folio) return ret; } -static inline int PageUptodate(struct page *page) +static inline int PageUptodate(const struct page *page) { return folio_test_uptodate(page_folio(page)); } @@ -806,9 +806,9 @@ void set_page_writeback(struct page *page); #define folio_start_writeback_keepwrite(folio) \ __folio_start_writeback(folio, true) -static __always_inline bool folio_test_head(struct folio *folio) +static __always_inline bool folio_test_head(const struct folio *folio) { - return test_bit(PG_head, folio_flags(folio, FOLIO_PF_ANY)); + return test_bit(PG_head, const_folio_flags(folio, FOLIO_PF_ANY)); } static __always_inline int PageHead(const struct page *page) @@ -827,7 +827,7 @@ CLEARPAGEFLAG(Head, head, PF_ANY) * * Return: True if the folio is larger than one page. */ -static inline bool folio_test_large(struct folio *folio) +static inline bool folio_test_large(const struct folio *folio) { return folio_test_head(folio); } @@ -856,7 +856,7 @@ TESTPAGEFLAG_FALSE(LargeRmappable, large_rmappable) #define PG_head_mask ((1UL << PG_head)) #ifdef CONFIG_HUGETLB_PAGE -int PageHuge(struct page *page); +int PageHuge(const struct page *page); SETPAGEFLAG(HugeTLB, hugetlb, PF_SECOND) CLEARPAGEFLAG(HugeTLB, hugetlb, PF_SECOND) @@ -869,10 +869,10 @@ CLEARPAGEFLAG(HugeTLB, hugetlb, PF_SECOND) * Return: True for hugetlbfs folios, false for anon folios or folios * belonging to other filesystems. */ -static inline bool folio_test_hugetlb(struct folio *folio) +static inline bool folio_test_hugetlb(const struct folio *folio) { return folio_test_large(folio) && - test_bit(PG_hugetlb, folio_flags(folio, 1)); + test_bit(PG_hugetlb, const_folio_flags(folio, 1)); } #else TESTPAGEFLAG_FALSE(Huge, hugetlb) @@ -887,7 +887,7 @@ TESTPAGEFLAG_FALSE(Huge, hugetlb) * hugetlbfs pages, but not normal pages. PageTransHuge() can only be * called only in the core VM paths where hugetlbfs pages can't exist. 
*/ -static inline int PageTransHuge(struct page *page) +static inline int PageTransHuge(const struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); return PageHead(page); @@ -898,7 +898,7 @@ static inline int PageTransHuge(struct page *page) * and hugetlbfs pages, so it should only be called when it's known * that hugetlbfs pages aren't involved. */ -static inline int PageTransCompound(struct page *page) +static inline int PageTransCompound(const struct page *page) { return PageCompound(page); } @@ -908,7 +908,7 @@ static inline int PageTransCompound(struct page *page) * and hugetlbfs pages, so it should only be called when it's known * that hugetlbfs pages aren't involved. */ -static inline int PageTransTail(struct page *page) +static inline int PageTransTail(const struct page *page) { return PageTail(page); } @@ -972,7 +972,7 @@ static inline int page_type_has_type(unsigned int page_type) return (int)page_type < PAGE_MAPCOUNT_RESERVE; } -static inline int page_has_type(struct page *page) +static inline int page_has_type(const struct page *page) { return page_type_has_type(page->page_type); } @@ -1056,7 +1056,7 @@ extern bool is_free_buddy_page(struct page *page); PAGEFLAG(Isolated, isolated, PF_ANY); -static __always_inline int PageAnonExclusive(struct page *page) +static __always_inline int PageAnonExclusive(const struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnon(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); @@ -1129,12 +1129,12 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) * Determine if a page has private stuff, indicating that release routines * should be invoked upon it. 
*/ -static inline int page_has_private(struct page *page) +static inline int page_has_private(const struct page *page) { return !!(page->flags & PAGE_FLAGS_PRIVATE); } -static inline bool folio_has_private(struct folio *folio) +static inline bool folio_has_private(const struct folio *folio) { return page_has_private(&folio->page); } -- cgit v1.2.3 From 9164448d3100d5118bda5e9d38b69a9f32cea509 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 19:23:35 +0000 Subject: mm: remove cast from page_to_nid() Now that PF_POISONED_CHECK() can take a const argument, we can drop the cast. Link: https://lkml.kernel.org/r/20240227192337.757313-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 02547c8adda0..699e850d143c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1641,13 +1641,11 @@ static inline int page_zone_id(struct page *page) } #ifdef NODE_NOT_IN_PAGE_FLAGS -extern int page_to_nid(const struct page *page); +int page_to_nid(const struct page *page); #else static inline int page_to_nid(const struct page *page) { - struct page *p = (struct page *)page; - - return (PF_POISONED_CHECK(p)->flags >> NODES_PGSHIFT) & NODES_MASK; + return (PF_POISONED_CHECK(page)->flags >> NODES_PGSHIFT) & NODES_MASK; } #endif -- cgit v1.2.3 From 22beb471b46a1a408720498f7895232edab559d1 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 4 Mar 2024 19:07:18 +0800 Subject: mm: pgtable: correct the wrong comment about ptdesc->__page_flags Patch series "minor fixes and supplement for ptdesc". In this series, the [PATCH 1/3] and [PATCH 2/3] are fixes for some issues discovered during code inspection. 
The [PATCH 3/3] is a supplement to ptdesc conversion in s390, I don't know why this is not done in the commit 6326c26c1514 ("s390: convert various pgalloc functions to use ptdescs"), maybe I missed something. And since I don't have an s390 environment, I hope kernel test robot can help compile and test, and this is why I did not fold [PATCH 2/3] and [PATCH 3/3] into one patch. This patch (of 3): The commit 32cc0b7c9d50 ("powerpc: add pte_free_defer() for pgtables sharing page") introduced the use of PageActive flag to page table fragments tracking, so the ptdesc->__page_flags is not unused, so correct the wrong comment. Link: https://lkml.kernel.org/r/cover.1709541697.git.zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/cc42d5915fd98fd802f920de243f535efcfe01db.1709541697.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Vishal Moola (Oracle) Cc: Christian Borntraeger Cc: Claudio Imbrenda Cc: Janosch Frank Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a7223ba3ea1e..5ea77969daae 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -419,7 +419,7 @@ FOLIO_MATCH(compound_head, _head_2a); /** * struct ptdesc - Memory descriptor for page tables. - * @__page_flags: Same as page flags. Unused for page tables. + * @__page_flags: Same as page flags. Powerpc only. * @pt_rcu_head: For freeing page table pages. * @pt_list: List of used page tables. Used for s390 and x86. * @_pt_pad_1: Padding that aliases with page's compound head. 
-- cgit v1.2.3 From ea919671517a46b75f975fcf126e08ccf7e9c09f Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 4 Mar 2024 19:07:19 +0800 Subject: mm: pgtable: add missing pt_index to struct ptdesc In s390, the page->index field is used for gmap (see gmap_shadow_pgt()), so add the corresponding pt_index to struct ptdesc and add a comment to clarify this. Link: https://lkml.kernel.org/r/283624c2af45fb2090b41a6b1b5481bb0a45bad7.1709541697.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Christian Borntraeger Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Hugh Dickins Cc: Janosch Frank Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5ea77969daae..5240bd7bca33 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -425,6 +425,7 @@ FOLIO_MATCH(compound_head, _head_2a); * @_pt_pad_1: Padding that aliases with page's compound head. * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs. * @__page_mapping: Aliases with page->mapping. Unused for page tables. + * @pt_index: Used for s390 gmap. * @pt_mm: Used for x86 pgds. * @pt_frag_refcount: For fragmented page table tracking. Powerpc only. * @_pt_pad_2: Padding to ensure proper alignment. 
@@ -450,6 +451,7 @@ struct ptdesc { unsigned long __page_mapping; union { + pgoff_t pt_index; struct mm_struct *pt_mm; atomic_t pt_frag_refcount; }; @@ -475,6 +477,7 @@ TABLE_MATCH(flags, __page_flags); TABLE_MATCH(compound_head, pt_list); TABLE_MATCH(compound_head, _pt_pad_1); TABLE_MATCH(mapping, __page_mapping); +TABLE_MATCH(index, pt_index); TABLE_MATCH(rcu_head, pt_rcu_head); TABLE_MATCH(page_type, __page_type); TABLE_MATCH(_refcount, __page_refcount); -- cgit v1.2.3 From c05995b7ec2a73bf813a8944978e175f8e4ec3ac Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 5 Mar 2024 12:37:50 +0800 Subject: mm/treewide: align up pXd_leaf() retval across archs Even if pXd_leaf() API is defined globally, it's not clear on the retval, and there are three types used (bool, int, unsigned long). Always return a boolean for pXd_leaf() APIs. Link: https://lkml.kernel.org/r/20240305043750.93762-11-peterx@redhat.com Signed-off-by: Peter Xu Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Mike Rapoport (IBM) Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: "Aneesh Kumar K.V" Cc: Borislav Petkov Cc: Christophe Leroy Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Kirill A. Shutemov Cc: Michael Ellerman Cc: Muchun Song Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Thomas Gleixner Cc: Vincenzo Frascino Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index a36cf4e124b0..85fc7554cd52 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1777,16 +1777,16 @@ typedef unsigned int pgtbl_mod_mask; * Only meaningful when called on a valid entry.
*/ #ifndef pgd_leaf -#define pgd_leaf(x) 0 +#define pgd_leaf(x) false #endif #ifndef p4d_leaf -#define p4d_leaf(x) 0 +#define p4d_leaf(x) false #endif #ifndef pud_leaf -#define pud_leaf(x) 0 +#define pud_leaf(x) false #endif #ifndef pmd_leaf -#define pmd_leaf(x) 0 +#define pmd_leaf(x) false #endif #ifndef pgd_leaf_size -- cgit v1.2.3 From e6f798225a31485e47a6e4f6aa07ee9fdf80c2cb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 4 Mar 2024 19:05:16 -0800 Subject: mm: Introduce VM_SPARSE kind and vm_area_[un]map_pages(). vmap/vmalloc APIs are used to map a set of pages into contiguous kernel virtual space. get_vm_area() with appropriate flag is used to request an area of kernel address range. It's used for vmalloc, vmap, ioremap, xen use cases. - vmalloc use case dominates the usage. Such vm areas have VM_ALLOC flag. - the areas created by vmap() function should be tagged with VM_MAP. - ioremap areas are tagged with VM_IOREMAP. BPF would like to extend the vmap API to implement a lazily-populated sparse, yet contiguous kernel virtual space. Introduce VM_SPARSE flag and vm_area_map_pages(area, start_addr, count, pages) API to map a set of pages within a given area. It has the same sanity checks as vmap() does. It also checks that get_vm_area() was created with VM_SPARSE flag which identifies such areas in /proc/vmallocinfo and returns zero pages on read through /proc/kcore. The next commits will introduce bpf_arena which is a sparsely populated shared memory region between bpf program and user space process. 
It will map privately-managed pages into a sparse vm area with the following steps: // request virtual memory region during bpf prog verification area = get_vm_area(area_size, VM_SPARSE); // on demand vm_area_map_pages(area, kaddr, kend, pages); vm_area_unmap_pages(area, kaddr, kend); // after bpf program is detached and unloaded free_vm_area(area); Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Reviewed-by: Christoph Hellwig Reviewed-by: Pasha Tatashin Link: https://lore.kernel.org/bpf/20240305030516.41519-3-alexei.starovoitov@gmail.com --- include/linux/vmalloc.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index c720be70c8dd..0f72c85a377b 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -35,6 +35,7 @@ struct iov_iter; /* in uio.h */ #else #define VM_DEFER_KMEMLEAK 0 #endif +#define VM_SPARSE 0x00001000 /* sparse vm_area. not all pages are present. */ /* bits [20..32] reserved for arch specific ioremap internals */ @@ -232,6 +233,10 @@ static inline bool is_vm_area_hugepages(const void *addr) } #ifdef CONFIG_MMU +int vm_area_map_pages(struct vm_struct *area, unsigned long start, + unsigned long end, struct page **pages); +void vm_area_unmap_pages(struct vm_struct *area, unsigned long start, + unsigned long end); void vunmap_range(unsigned long addr, unsigned long end); static inline void set_vm_flush_reset_perms(void *addr) { -- cgit v1.2.3 From 011832b97b311bb9e3c27945bc0d1089a14209c9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 5 Mar 2024 19:19:26 -0800 Subject: bpf: Introduce may_goto instruction Introduce may_goto instruction that from the verifier pov is similar to open coded iterators bpf_for()/bpf_repeat() and bpf_loop() helper, but it doesn't iterate any objects. In assembly 'may_goto' is a nop most of the time until bpf runtime has to terminate the program for whatever reason. 
In the current implementation may_goto has a hidden counter, but other mechanisms can be used. For programs written in C the later patch introduces 'cond_break' macro that combines 'may_goto' with 'break' statement and has similar semantics: cond_break is a nop until bpf runtime has to break out of this loop. It can be used in any normal "for" or "while" loop, like for (i = zero; i < cnt; cond_break, i++) { The verifier recognizes that may_goto is used in the program, reserves additional 8 bytes of stack, initializes them in subprog prologue, and replaces may_goto instruction with: aux_reg = *(u64 *)(fp - 40) if aux_reg == 0 goto pc+off aux_reg -= 1 *(u64 *)(fp - 40) = aux_reg may_goto instruction can be used by LLVM to implement __builtin_memcpy, __builtin_strcmp. may_goto is not a full substitute for bpf_for() macro. bpf_for() doesn't have an induction variable that the verifier sees, so 'i' in bpf_for(i, 0, 100) is seen as imprecise and bounded. But when the code is written as: for (i = 0; i < 100; cond_break, i++) the verifier sees 'i' as precise constant zero, hence cond_break (aka may_goto) doesn't help to converge the loop. A static or global variable can be used as a workaround: static int zero = 0; for (i = zero; i < 100; cond_break, i++) // works! may_goto works well with arena pointers that don't need to be bounds checked on access. Load/store from arena returns imprecise unbounded scalar and loops with may_goto pass the verifier. Reserve new opcode BPF_JMP | BPF_JCOND for may_goto insn. JCOND stands for conditional pseudo jump. Since goto_or_nop insn was proposed, it may use the same opcode.
may_goto vs goto_or_nop can be distinguished by src_reg: code = BPF_JMP | BPF_JCOND src_reg = 0 - may_goto src_reg = 1 - goto_or_nop Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: Eduard Zingerman Acked-by: John Fastabend Tested-by: John Fastabend Link: https://lore.kernel.org/bpf/20240306031929.42666-2-alexei.starovoitov@gmail.com --- include/linux/bpf_verifier.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 84365e6dd85d..4b0f6600e499 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -449,6 +449,7 @@ struct bpf_verifier_state { u32 jmp_history_cnt; u32 dfs_depth; u32 callback_unroll_depth; + u32 may_goto_depth; }; #define bpf_get_spilled_reg(slot, frame, mask) \ @@ -619,6 +620,7 @@ struct bpf_subprog_info { u32 start; /* insn idx of function entry point */ u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u16 stack_depth; /* max. stack depth used by this function */ + u16 stack_extra; bool has_tail_call: 1; bool tail_call_reachable: 1; bool has_ld_abs: 1; -- cgit v1.2.3 From f311507c5336ad6b9ca7687c35c3bd7a4fe2868c Mon Sep 17 00:00:00 2001 From: "Hsin-Yu.Chen" Date: Wed, 6 Mar 2024 12:19:00 +0800 Subject: i2c: remove redundant condition I2C_M_RD is defined as and guaranteed to be 1 and 'flag & I2C_M_RD' is one or zero. No need for an additional condition to obtain the value. 
Signed-off-by: Hsin-Yu.Chen Reviewed-by: Andi Shyti [wsa: slightly updated commit message] Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index ff93ff8b257c..5e6cd43a6dbd 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -931,7 +931,7 @@ static inline int i2c_adapter_id(struct i2c_adapter *adap) static inline u8 i2c_8bit_addr_from_msg(const struct i2c_msg *msg) { - return (msg->addr << 1) | (msg->flags & I2C_M_RD ? 1 : 0); + return (msg->addr << 1) | (msg->flags & I2C_M_RD); } u8 *i2c_get_dma_safe_msg_buf(struct i2c_msg *msg, unsigned int threshold); -- cgit v1.2.3 From a0873a5d542559698edfd4c8fc6e6636d338eea2 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Wed, 14 Feb 2024 19:08:00 -0800 Subject: net/mlx5: Add MPIR bit in mcam_access_reg Add a cap bit in mcam_access_reg to check for MPIR support. Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 628a3aa7a7e0..2756bdb654b4 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10253,7 +10253,9 @@ struct mlx5_ifc_mcam_access_reg_bits { u8 mcqi[0x1]; u8 mcqs[0x1]; - u8 regs_95_to_87[0x9]; + u8 regs_95_to_90[0x6]; + u8 mpir[0x1]; + u8 regs_88_to_87[0x2]; u8 mpegc[0x1]; u8 mtutc[0x1]; u8 regs_84_to_68[0x11]; -- cgit v1.2.3 From ed29705e4ed1d5c1b2184fecc4684bd56c5d24ee Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Wed, 14 Feb 2024 19:08:13 -0800 Subject: net/mlx5: Enable SD feature Have an actual mlx5_sd instance in the core device, and fix the getter accordingly. This allows SD stuff to flow, the feature becomes supported only here. 
Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 41f03b352401..bf9324a31ae9 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -823,6 +823,7 @@ struct mlx5_core_dev { struct blocking_notifier_head macsec_nh; #endif u64 num_ipsec_offloads; + struct mlx5_sd *sd; }; struct mlx5_db { -- cgit v1.2.3 From 1368d06dd2c99186174290c03d79c132db16efe2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Duje=20Mihanovi=C4=87?= Date: Thu, 25 Jan 2024 16:30:53 +0100 Subject: leds: Introduce ExpressWire library MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ExpressWire protocol is shared between at least KTD2692 and KTD2801 with slight differences such as timings and the former not having a defined set of pulses for enabling the protocol (possibly because it does not support PWM unlike KTD2801). Despite these differences the ExpressWire handling code can be shared between the two, so in preparation for adding KTD2801 support introduce a library implementing this protocol. Suggested-by: Daniel Thompson Reviewed-by: Linus Walleij Reviewed-by: Daniel Thompson Signed-off-by: Duje Mihanović Link: https://lore.kernel.org/r/20240125-ktd2801-v5-1-e22da232a825@skole.hr Signed-off-by: Lee Jones --- include/linux/leds-expresswire.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 include/linux/leds-expresswire.h (limited to 'include/linux') diff --git a/include/linux/leds-expresswire.h b/include/linux/leds-expresswire.h new file mode 100644 index 000000000000..3c61902ccac8 --- /dev/null +++ b/include/linux/leds-expresswire.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Shared library for Kinetic's ExpressWire protocol. 
+ * This protocol works by pulsing the ExpressWire IC's control GPIO. + * ktd2692 and ktd2801 are known to use this protocol. + */ + +#ifndef _LEDS_EXPRESSWIRE_H +#define _LEDS_EXPRESSWIRE_H + +#include <linux/gpio/consumer.h> + +struct expresswire_timing { + unsigned long poweroff_us; + unsigned long detect_delay_us; + unsigned long detect_us; + unsigned long data_start_us; + unsigned long end_of_data_low_us; + unsigned long end_of_data_high_us; + unsigned long short_bitset_us; + unsigned long long_bitset_us; +}; + +struct expresswire_common_props { + struct gpio_desc *ctrl_gpio; + struct expresswire_timing timing; +}; + +void expresswire_power_off(struct expresswire_common_props *props); +void expresswire_enable(struct expresswire_common_props *props); +void expresswire_start(struct expresswire_common_props *props); +void expresswire_end(struct expresswire_common_props *props); +void expresswire_set_bit(struct expresswire_common_props *props, bool bit); +void expresswire_write_u8(struct expresswire_common_props *props, u8 val); + +#endif /* _LEDS_EXPRESSWIRE_H */ -- cgit v1.2.3 From 7774f3d1dd3822e938e236df67766436c0debd11 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 23 Feb 2024 22:30:00 +0200 Subject: leds: expresswire: Don't use "proxy" headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update header inclusions to follow IWYU (Include What You Use) principle.
Signed-off-by: Andy Shevchenko Reviewed-by: Duje Mihanović Link: https://lore.kernel.org/r/20240223203010.881065-1-andriy.shevchenko@linux.intel.com Signed-off-by: Lee Jones --- include/linux/leds-expresswire.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/leds-expresswire.h b/include/linux/leds-expresswire.h index 3c61902ccac8..a422921f4159 100644 --- a/include/linux/leds-expresswire.h +++ b/include/linux/leds-expresswire.h @@ -8,7 +8,9 @@ #ifndef _LEDS_EXPRESSWIRE_H #define _LEDS_EXPRESSWIRE_H -#include <linux/gpio/consumer.h> +#include <linux/types.h> + +struct gpio_desc; struct expresswire_timing { unsigned long poweroff_us; -- cgit v1.2.3 From 211f8ec9400b58fb97cf4b6bd7033781e889bf53 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 9 Jan 2024 10:06:39 +0100 Subject: leds: Remove led_init_default_state_get() and devm_led_classdev_register_ext() stubs These two functions have stub implementations that are called when NEW_LEDS and/or LEDS_CLASS are disabled, theoretically allowing drivers to optionally use the LED subsystem. However, this has never really worked because a built-in driver is unable to link against these functions if the LED class is in a loadable module. Heiner ran into this problem with a driver that newly gained a LEDS_CLASS dependency and suggested using an IS_REACHABLE() check. This is the reverse approach, removing the stub entirely to acknowledge that it is pointless in its current form, and that not having it avoids misleading developers into thinking that they can rely on it. This survived around 1000 randconfig builds to validate that any callers of the interface already have the correct Kconfig dependency, with the exception of the one that Heiner just added.
Cc: Heiner Kallweit Link: https://lore.kernel.org/linux-leds/0f6f432b-c650-4bb8-a1b5-fe3372804d52@gmail.com/T/#u Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20240109090715.982332-1-arnd@kernel.org Signed-off-by: Lee Jones --- include/linux/leds.h | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/leds.h b/include/linux/leds.h index 4754b02d3a2c..7598d472903a 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -82,15 +82,7 @@ struct led_init_data { bool devname_mandatory; }; -#if IS_ENABLED(CONFIG_NEW_LEDS) enum led_default_state led_init_default_state_get(struct fwnode_handle *fwnode); -#else -static inline enum led_default_state -led_init_default_state_get(struct fwnode_handle *fwnode) -{ - return LEDS_DEFSTATE_OFF; -} -#endif struct led_hw_trigger_type { int dummy; @@ -279,20 +271,9 @@ static inline int led_classdev_register(struct device *parent, return led_classdev_register_ext(parent, led_cdev, NULL); } -#if IS_ENABLED(CONFIG_LEDS_CLASS) int devm_led_classdev_register_ext(struct device *parent, struct led_classdev *led_cdev, struct led_init_data *init_data); -#else -static inline int -devm_led_classdev_register_ext(struct device *parent, - struct led_classdev *led_cdev, - struct led_init_data *init_data) -{ - return 0; -} -#endif - static inline int devm_led_classdev_register(struct device *parent, struct led_classdev *led_cdev) { -- cgit v1.2.3 From 09e3f3244e8480d53873bb86a3808edaa3f4e314 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 9 Jan 2024 10:06:40 +0100 Subject: leds: Make flash and multicolor dependencies unconditional Along the same lines as making devm_led_classdev_register() declared extern unconditional, do the same thing for the two sub-classes that have similar stubs. 
The users of these interfaces go to great lengths to allow building with both the generic leds API and the extended version, but realistically there is not much use in this, so just simplify it to always rely on it and remove the confusing fallback logic. Signed-off-by: Arnd Bergmann Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20240109090715.982332-2-arnd@kernel.org Signed-off-by: Lee Jones --- include/linux/led-class-flash.h | 24 ------------------------ include/linux/led-class-multicolor.h | 29 ----------------------------- 2 files changed, 53 deletions(-) (limited to 'include/linux') diff --git a/include/linux/led-class-flash.h b/include/linux/led-class-flash.h index 612b4cab3819..36df927ec4b7 100644 --- a/include/linux/led-class-flash.h +++ b/include/linux/led-class-flash.h @@ -85,7 +85,6 @@ static inline struct led_classdev_flash *lcdev_to_flcdev( return container_of(lcdev, struct led_classdev_flash, led_cdev); } -#if IS_ENABLED(CONFIG_LEDS_CLASS_FLASH) /** * led_classdev_flash_register_ext - register a new object of LED class with * init data and with support for flash LEDs @@ -116,29 +115,6 @@ int devm_led_classdev_flash_register_ext(struct device *parent, void devm_led_classdev_flash_unregister(struct device *parent, struct led_classdev_flash *fled_cdev); -#else - -static inline int led_classdev_flash_register_ext(struct device *parent, - struct led_classdev_flash *fled_cdev, - struct led_init_data *init_data) -{ - return 0; -} - -static inline void led_classdev_flash_unregister(struct led_classdev_flash *fled_cdev) {}; -static inline int devm_led_classdev_flash_register_ext(struct device *parent, - struct led_classdev_flash *fled_cdev, - struct led_init_data *init_data) -{ - return 0; -} - -static inline void devm_led_classdev_flash_unregister(struct device *parent, - struct led_classdev_flash *fled_cdev) -{}; - -#endif /* IS_ENABLED(CONFIG_LEDS_CLASS_FLASH) */ - static inline int led_classdev_flash_register(struct device *parent, struct 
led_classdev_flash *fled_cdev) { diff --git a/include/linux/led-class-multicolor.h b/include/linux/led-class-multicolor.h index 210d57bcd767..db9f34c6736e 100644 --- a/include/linux/led-class-multicolor.h +++ b/include/linux/led-class-multicolor.h @@ -30,7 +30,6 @@ static inline struct led_classdev_mc *lcdev_to_mccdev( return container_of(led_cdev, struct led_classdev_mc, led_cdev); } -#if IS_ENABLED(CONFIG_LEDS_CLASS_MULTICOLOR) /** * led_classdev_multicolor_register_ext - register a new object of led_classdev * class with support for multicolor LEDs @@ -64,34 +63,6 @@ int devm_led_classdev_multicolor_register_ext(struct device *parent, void devm_led_classdev_multicolor_unregister(struct device *parent, struct led_classdev_mc *mcled_cdev); -#else - -static inline int led_classdev_multicolor_register_ext(struct device *parent, - struct led_classdev_mc *mcled_cdev, - struct led_init_data *init_data) -{ - return 0; -} - -static inline void led_classdev_multicolor_unregister(struct led_classdev_mc *mcled_cdev) {}; -static inline int led_mc_calc_color_components(struct led_classdev_mc *mcled_cdev, - enum led_brightness brightness) -{ - return 0; -} - -static inline int devm_led_classdev_multicolor_register_ext(struct device *parent, - struct led_classdev_mc *mcled_cdev, - struct led_init_data *init_data) -{ - return 0; -} - -static inline void devm_led_classdev_multicolor_unregister(struct device *parent, - struct led_classdev_mc *mcled_cdev) -{}; - -#endif /* IS_ENABLED(CONFIG_LEDS_CLASS_MULTICOLOR) */ static inline int led_classdev_multicolor_register(struct device *parent, struct led_classdev_mc *mcled_cdev) -- cgit v1.2.3 From 08b7dab9f025e20dc02bb4ad19b8e519c3948d20 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 28 Feb 2024 10:38:26 +0100 Subject: leds: Fix ifdef check for gpio_led_register_device() gpio_led_register_device() is built whenever CONFIG_LEDS_GPIO_REGISTER is enabled, and this may be used even when CONFIG_NEW_LEDS is turned off. 
However, the stub declaration in the header is provided for all configs without CONFIG_NEW_LEDS, resulting in a build failure: drivers/leds/leds-gpio-register.c:24:1: error: redefinition of 'gpio_led_register_device' 24 | gpio_led_register_device(int id, const struct gpio_led_platform_data *pdata) | ^ include/linux/leds.h:646:39: note: previous definition is here Change the #ifdef check to match the definition. Note: this apparently took years of randconfig builds to hit, since a number of other drivers just 'select NEW_LEDS' anyway. Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20240228093834.2230004-1-arnd@kernel.org Signed-off-by: Lee Jones --- include/linux/leds.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/leds.h b/include/linux/leds.h index 7598d472903a..db6b114bb3d9 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -639,7 +639,7 @@ struct gpio_led_platform_data { gpio_blink_set_t gpio_blink_set; }; -#ifdef CONFIG_NEW_LEDS +#ifdef CONFIG_LEDS_GPIO_REGISTER struct platform_device *gpio_led_register_device( int id, const struct gpio_led_platform_data *pdata); #else -- cgit v1.2.3 From 5d51a794414359dd387c3da7a2ea7602c67f84a6 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 7 Mar 2024 10:55:16 +0000 Subject: firmware: cirrus: cs_dsp: Remove non-existent member from kerneldoc The kerneldoc for struct cs_dsp refers to a fw_file_name member but there's no such member. 
Signed-off-by: Richard Fitzgerald Link: https://msgid.link/r/20240307105516.40250-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/firmware/cirrus/cs_dsp.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h index 29cd11d5a3cf..23384a54d575 100644 --- a/include/linux/firmware/cirrus/cs_dsp.h +++ b/include/linux/firmware/cirrus/cs_dsp.h @@ -123,7 +123,6 @@ struct cs_dsp_client_ops; * @sysclk_mask: Mask of frequency bits within sysclk register (ADSP1 only) * @sysclk_shift: Shift of frequency bits within sysclk register (ADSP1 only) * @alg_regions: List of currently loaded algorithm regions - * @fw_file_name: Filename of the current firmware * @fw_name: Name of the current firmware * @fw_id: ID of the current firmware, obtained from the wmfw * @fw_id_version: Version of the firmware, obtained from the wmfw -- cgit v1.2.3 From 68ac1e46425c54653ddb5f559bc37abe19071024 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Fri, 1 Mar 2024 17:43:08 +0100 Subject: net: phylink: clean the pcs_get_state documentation commit 4d72c3bb60dd ("net: phylink: strip out pre-March 2020 legacy code") dropped the mac_pcs_get_state ops in phylink_mac_ops in favor of dedicated PCS operation pcs_get_state. However, the documentation for the pcs_get_state ops was incorrectly converted and now self-references. Drop the extra comment. Signed-off-by: Maxime Chevallier Signed-off-by: Paolo Abeni --- include/linux/phylink.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 6ba411732a0d..9a57deefcb07 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -480,9 +480,6 @@ void pcs_disable(struct phylink_pcs *pcs); * negotiation completion state in @state->an_complete, and link up state * in @state->link. If possible, @state->lp_advertising should also be * populated. 
- * - * When present, this overrides pcs_get_state() in &struct - * phylink_pcs_ops. */ void pcs_get_state(struct phylink_pcs *pcs, struct phylink_link_state *state); -- cgit v1.2.3 From 14fe5a98fb24192f73639590d9d3cdb5640d48db Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 7 Mar 2024 17:01:00 +0200 Subject: spi: Fix types of the last chip select storage variables First of all, last_cs_index_mask should be aligned with the original cs_index_mask, which is 16-bit (for now) wide. Use the same pattern for the last_cs_index_mask. Second, last_cs can be negative and since 'char' is equal to 'unsigned char' in the kernel, it's incorrect, strictly speaking, to assign a signed number to it. Use s8 type as it's done for *_native_cs ones. With this change, regroup the ordering a bit to avoid wasting too much memory space on padding. Shuffle kernel documentation accordingly. Signed-off-by: Andy Shevchenko Link: https://msgid.link/r/20240307150256.3789138-3-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index ddfb66dd4caf..b05d5a87c313 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -450,9 +450,11 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch * the @cur_msg_completion. This flag is used to signal the context that * is running spi_finalize_current_message() that it needs to complete() * @cur_msg_mapped: message has been mapped for DMA + * @fallback: fallback to PIO if DMA transfer return failure with + * SPI_TRANS_FAIL_NO_START. + * @last_cs_mode_high: was (mode & SPI_CS_HIGH) true on the last call to set_cs. * @last_cs: the last chip_select that is recorded by set_cs, -1 on non chip * selected - * @last_cs_mode_high: was (mode & SPI_CS_HIGH) true on the last call to set_cs. 
* @xfer_completion: used by core transfer_one_message() * @busy: message pump is busy * @running: message pump is running @@ -529,8 +531,6 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch * If the driver does not set this, the SPI core takes the snapshot as * close to the driver hand-over as possible. * @irq_flags: Interrupt enable state during PTP system timestamping - * @fallback: fallback to PIO if DMA transfer return failure with - * SPI_TRANS_FAIL_NO_START. * @queue_empty: signal green light for opportunistically skipping the queue * for spi_sync transfers. * @must_async: disable all fast paths in the core @@ -710,10 +710,10 @@ struct spi_controller { bool rt; bool auto_runtime_pm; bool cur_msg_mapped; - char last_cs[SPI_CS_CNT_MAX]; - char last_cs_index_mask; - bool last_cs_mode_high; bool fallback; + bool last_cs_mode_high; + s8 last_cs[SPI_CS_CNT_MAX]; + u32 last_cs_index_mask : SPI_CS_CNT_MAX; struct completion xfer_completion; size_t max_dma_len; -- cgit v1.2.3 From ab23f1bffcf690ffae2b0c4bb8b09420299be05c Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Sat, 24 Feb 2024 11:44:03 +0000 Subject: slimbus: core: make slimbus_bus const Since commit d492cc2573a0 ("driver core: device.h: make struct bus_type a const *"), the driver core can properly handle constant struct bus_type, move the slimbus_bus variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: "Ricardo B. 
Marliere" Reviewed-by: Greg Kroah-Hartman Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20240224114403.86230-3-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/slimbus.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/slimbus.h b/include/linux/slimbus.h index 12c9719b2a55..3042385b7b40 100644 --- a/include/linux/slimbus.h +++ b/include/linux/slimbus.h @@ -10,7 +10,7 @@ #include #include -extern struct bus_type slimbus_bus; +extern const struct bus_type slimbus_bus; /** * struct slim_eaddr - Enumeration address for a SLIMbus device -- cgit v1.2.3 From e34b943068d30f20db31f28100affdaaedc7efab Mon Sep 17 00:00:00 2001 From: Praveen Teja Kundanala Date: Sat, 24 Feb 2024 11:45:10 +0000 Subject: firmware: xilinx: Add ZynqMP efuse access API Add zynqmp_pm_efuse_access API in the ZynqMP firmware for read/write access of efuse memory. Signed-off-by: Praveen Teja Kundanala Acked-by: Michal Simek Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20240224114516.86365-6-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/firmware/xlnx-zynqmp.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 9a7e52739251..1a069a56c961 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -3,6 +3,7 @@ * Xilinx Zynq MPSoC Firmware layer * * Copyright (C) 2014-2021 Xilinx + * Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. 
* * Michal Simek * Davorin Mista @@ -171,6 +172,7 @@ enum pm_api_id { PM_CLOCK_GETPARENT = 44, PM_FPGA_READ = 46, PM_SECURE_AES = 47, + PM_EFUSE_ACCESS = 53, PM_FEATURE_CHECK = 63, }; @@ -562,6 +564,7 @@ int zynqmp_pm_set_requirement(const u32 node, const u32 capabilities, const u32 qos, const enum zynqmp_pm_request_ack ack); int zynqmp_pm_aes_engine(const u64 address, u32 *out); +int zynqmp_pm_efuse_access(const u64 address, u32 *out); int zynqmp_pm_sha_hash(const u64 address, const u32 size, const u32 flags); int zynqmp_pm_fpga_load(const u64 address, const u32 size, const u32 flags); int zynqmp_pm_fpga_get_status(u32 *value); @@ -749,6 +752,11 @@ static inline int zynqmp_pm_aes_engine(const u64 address, u32 *out) return -ENODEV; } +static inline int zynqmp_pm_efuse_access(const u64 address, u32 *out) +{ + return -ENODEV; +} + static inline int zynqmp_pm_sha_hash(const u64 address, const u32 size, const u32 flags) { -- cgit v1.2.3 From cb1c1224193e648b4108dd06ebb7cc86b5c514ad Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Mon, 12 Feb 2024 08:41:01 -0300 Subject: dio: make dio_bus_type const Now that the driver core can properly handle constant struct bus_type, move the dio_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: "Ricardo B. 
Marliere" Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20240212-bus_cleanup-dio-v2-1-3b1ba4c0547d@marliere.net Signed-off-by: Greg Kroah-Hartman --- include/linux/dio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dio.h b/include/linux/dio.h index 5abd07361eb5..2b5923909f96 100644 --- a/include/linux/dio.h +++ b/include/linux/dio.h @@ -68,7 +68,7 @@ struct dio_bus { }; extern struct dio_bus dio_bus; /* Single DIO bus */ -extern struct bus_type dio_bus_type; +extern const struct bus_type dio_bus_type; /* * DIO device IDs -- cgit v1.2.3 From d843f031d9e90462253015bc0bd9e3852d206bf2 Mon Sep 17 00:00:00 2001 From: Wayne Chang Date: Thu, 7 Mar 2024 11:03:27 +0800 Subject: phy: tegra: xusb: Add API to retrieve the port number of phy This patch introduces a new API, tegra_xusb_padctl_get_port_number, to the Tegra XUSB Pad Controller driver. This API is used to identify the USB port that is associated with a given PHY. The function takes a PHY pointer for either a USB2 PHY or USB3 PHY as input and returns the corresponding port number. If the PHY pointer is invalid, it returns -ENODEV. 
Cc: stable@vger.kernel.org Signed-off-by: Wayne Chang Reviewed-by: Jon Hunter Tested-by: Jon Hunter Link: https://lore.kernel.org/r/20240307030328.1487748-2-waynec@nvidia.com Signed-off-by: Greg Kroah-Hartman --- include/linux/phy/tegra/xusb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/phy/tegra/xusb.h b/include/linux/phy/tegra/xusb.h index 70998e6dd6fd..6ca51e0080ec 100644 --- a/include/linux/phy/tegra/xusb.h +++ b/include/linux/phy/tegra/xusb.h @@ -26,6 +26,7 @@ void tegra_phy_xusb_utmi_pad_power_down(struct phy *phy); int tegra_phy_xusb_utmi_port_reset(struct phy *phy); int tegra_xusb_padctl_get_usb3_companion(struct tegra_xusb_padctl *padctl, unsigned int port); +int tegra_xusb_padctl_get_port_number(struct phy *phy); int tegra_xusb_padctl_enable_phy_sleepwalk(struct tegra_xusb_padctl *padctl, struct phy *phy, enum usb_device_speed speed); int tegra_xusb_padctl_disable_phy_sleepwalk(struct tegra_xusb_padctl *padctl, struct phy *phy); -- cgit v1.2.3 From a13bd6f3c936edff957f2d02cf65c44046cb1243 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Mon, 26 Feb 2024 18:05:19 -0300 Subject: greybus: move is_gb_* functions out of greybus.h The functions below are only used within the context of drivers/greybus/core.c, so move them all into core and drop their 'inline' specifiers: is_gb_host_device(), is_gb_module(), is_gb_interface(), is_gb_control(), is_gb_bundle() and is_gb_svc(). Suggested-by: Alex Elder Cc: Greg Kroah-Hartman Signed-off-by: "Ricardo B. 
Marliere" Reviewed-by: Alex Elder Link: https://lore.kernel.org/r/20240226-device_cleanup-greybus2-v1-1-5f7d1161e684@marliere.net Signed-off-by: Greg Kroah-Hartman --- include/linux/greybus.h | 30 ------------------------------ 1 file changed, 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/greybus.h b/include/linux/greybus.h index 2cc570ea63bf..634c9511cf78 100644 --- a/include/linux/greybus.h +++ b/include/linux/greybus.h @@ -113,36 +113,6 @@ extern const struct device_type greybus_control_type; extern const struct device_type greybus_bundle_type; extern const struct device_type greybus_svc_type; -static inline int is_gb_host_device(const struct device *dev) -{ - return dev->type == &greybus_hd_type; -} - -static inline int is_gb_module(const struct device *dev) -{ - return dev->type == &greybus_module_type; -} - -static inline int is_gb_interface(const struct device *dev) -{ - return dev->type == &greybus_interface_type; -} - -static inline int is_gb_control(const struct device *dev) -{ - return dev->type == &greybus_control_type; -} - -static inline int is_gb_bundle(const struct device *dev) -{ - return dev->type == &greybus_bundle_type; -} - -static inline int is_gb_svc(const struct device *dev) -{ - return dev->type == &greybus_svc_type; -} - static inline bool cport_id_valid(struct gb_host_device *hd, u16 cport_id) { return cport_id != CPORT_ID_BAD && cport_id < hd->num_cports; -- cgit v1.2.3 From 0e439ba38e615e505404b3935585f1898bafaea9 Mon Sep 17 00:00:00 2001 From: Nipun Gupta Date: Mon, 26 Feb 2024 13:58:16 +0530 Subject: cdx: add MSI support for CDX bus Add CDX-MSI domain per CDX controller with gic-its domain as a parent, to support MSI for CDX devices. CDX devices allocate MSIs from the CDX domain. Also, introduce APIs to alloc and free IRQs for CDX domain. In CDX subsystem firmware is a controller for all devices and their configuration. 
CDX bus controller sends all the write_msi_msg commands to firmware running on RPU and the firmware interfaces with actual devices to pass this information to devices Since, CDX controller is the only way to communicate with the Firmware for MSI write info, CDX domain per controller required in contrast to having a CDX domain per device. Co-developed-by: Nikhil Agarwal Signed-off-by: Nikhil Agarwal Co-developed-by: Abhijit Gangurde Signed-off-by: Abhijit Gangurde Signed-off-by: Nipun Gupta Reviewed-by: Pieter Jansen van Vuuren Reviewed-by: Thomas Gleixner Tested-by: Nikhil Agarwal Link: https://lore.kernel.org/r/20240226082816.100872-1-nipun.gupta@amd.com Signed-off-by: Greg Kroah-Hartman --- include/linux/cdx/cdx_bus.h | 53 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h index 6355a36a3f81..b57118aaa679 100644 --- a/include/linux/cdx/cdx_bus.h +++ b/include/linux/cdx/cdx_bus.h @@ -12,6 +12,7 @@ #include #include #include +#include #define MAX_CDX_DEV_RESOURCES 4 #define CDX_CONTROLLER_ID_SHIFT 4 @@ -21,13 +22,25 @@ struct cdx_controller; enum { + CDX_DEV_MSI_CONF, CDX_DEV_BUS_MASTER_CONF, CDX_DEV_RESET_CONF, + CDX_DEV_MSI_ENABLE, +}; + +struct cdx_msi_config { + u64 addr; + u32 data; + u16 msi_index; }; struct cdx_device_config { u8 type; - bool bus_master_enable; + union { + struct cdx_msi_config msi; + bool bus_master_enable; + bool msi_enable; + }; }; typedef int (*cdx_bus_enable_cb)(struct cdx_controller *cdx, u8 bus_num); @@ -87,6 +100,7 @@ struct cdx_ops { * struct cdx_controller: CDX controller object * @dev: Linux device associated with the CDX controller. 
* @priv: private data + * @msi_domain: MSI domain * @id: Controller ID * @controller_registered: controller registered with bus * @ops: CDX controller ops @@ -94,6 +108,7 @@ struct cdx_ops { struct cdx_controller { struct device *dev; void *priv; + struct irq_domain *msi_domain; u32 id; bool controller_registered; struct cdx_ops *ops; @@ -120,9 +135,13 @@ struct cdx_controller { * @req_id: Requestor ID associated with CDX device * @is_bus: Is this bus device * @enabled: is this bus enabled + * @msi_dev_id: MSI Device ID associated with CDX device + * @num_msi: Number of MSI's supported by the device * @driver_override: driver name to force a match; do not set directly, * because core frees it; use driver_set_override() to * set or clear it. + * @irqchip_lock: lock to synchronize irq/msi configuration + * @msi_write_pending: MSI write pending for this device */ struct cdx_device { struct device dev; @@ -144,7 +163,11 @@ struct cdx_device { u32 req_id; bool is_bus; bool enabled; + u32 msi_dev_id; + u32 num_msi; const char *driver_override; + struct mutex irqchip_lock; + bool msi_write_pending; }; #define to_cdx_device(_dev) \ @@ -237,4 +260,32 @@ int cdx_set_master(struct cdx_device *cdx_dev); */ int cdx_clear_master(struct cdx_device *cdx_dev); +#ifdef CONFIG_GENERIC_MSI_IRQ +/** + * cdx_enable_msi - Enable MSI for the CDX device. + * @cdx_dev: device pointer + * + * Return: 0 for success, -errno on failure + */ +int cdx_enable_msi(struct cdx_device *cdx_dev); + +/** + * cdx_disable_msi - Disable MSI for the CDX device. 
+ * @cdx_dev: device pointer + */ +void cdx_disable_msi(struct cdx_device *cdx_dev); + +#else /* CONFIG_GENERIC_MSI_IRQ */ + +static inline int cdx_enable_msi(struct cdx_device *cdx_dev) +{ + return -ENODEV; +} + +static inline void cdx_disable_msi(struct cdx_device *cdx_dev) +{ +} + +#endif /* CONFIG_GENERIC_MSI_IRQ */ + #endif /* _CDX_BUS_H_ */ -- cgit v1.2.3 From 576882ef5e7fce030b65c92b508a0f84ea5a81c2 Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Mon, 5 Feb 2024 12:01:37 -0800 Subject: uio: introduce UIO_MEM_DMA_COHERENT type Add a UIO memtype specifically for sharing dma_alloc_coherent memory with userspace, backed by dma_mmap_coherent. This is mainly for the bnx2/bnx2x/bnx2i "cnic" interface, although there are a few other uio drivers which map dma_alloc_coherent memory and will be converted to use dma_mmap_coherent as well. Signed-off-by: Nilesh Javali Signed-off-by: Chris Leech Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240205200137.138302-1-cleech@redhat.com Signed-off-by: Greg Kroah-Hartman --- include/linux/uio_driver.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uio_driver.h b/include/linux/uio_driver.h index 47c5962b876b..18238dc8bfd3 100644 --- a/include/linux/uio_driver.h +++ b/include/linux/uio_driver.h @@ -28,19 +28,26 @@ struct uio_map; * logical, virtual, or physical & phys_addr_t * should always be large enough to handle any of * the address types) + * @dma_addr: DMA handle set by dma_alloc_coherent, used with + * UIO_MEM_DMA_COHERENT only (@addr should be the + * void * returned from the same dma_alloc_coherent call) * @offs: offset of device memory within the page * @size: size of IO (multiple of page size) * @memtype: type of memory addr points to * @internal_addr: ioremap-ped version of addr, for driver internal use + * @dma_device: device struct that was passed to dma_alloc_coherent, + * used with UIO_MEM_DMA_COHERENT only * @map: for use by the UIO 
core only. */ struct uio_mem { const char *name; phys_addr_t addr; + dma_addr_t dma_addr; unsigned long offs; resource_size_t size; int memtype; void __iomem *internal_addr; + struct device *dma_device; struct uio_map *map; }; @@ -158,6 +165,12 @@ extern int __must_check #define UIO_MEM_LOGICAL 2 #define UIO_MEM_VIRTUAL 3 #define UIO_MEM_IOVA 4 +/* + * UIO_MEM_DMA_COHERENT exists for legacy drivers that had been getting by with + * improperly mapping DMA coherent allocations through the other modes. + * Do not use in new drivers. + */ +#define UIO_MEM_DMA_COHERENT 5 /* defines for uio_port->porttype */ #define UIO_PORT_NONE 0 -- cgit v1.2.3 From 8dde8fa0cc3edce73c050b9882d06c1a575f6402 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 17 Jan 2024 00:33:07 -0800 Subject: firmware_loader: introduce __free() cleanup handler Define a cleanup handler using facilities from linux/cleanup.h to simplify error handling in code using the firmware loader. This will allow writing code like this: int driver_update_firmware(...) { const struct firmware *fw_entry __free(firmware) = NULL; int error; ... 
error = request_firmware(&fw_entry, fw_name, dev); if (error) { dev_err(dev, "failed to request firmware %s: %d", fw_name, error); return error; } error = check_firmware_valid(fw_entry); if (error) return error; guard(mutex)(&instance->lock); error = use_firmware(instance, fw); if (error) return error; return 0; } Signed-off-by: Dmitry Torokhov Acked-by: Luis Chamberalin Link: https://lore.kernel.org/r/ZaeQw7VXhnirX4pQ@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/firmware.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware.h b/include/linux/firmware.h index 0311858b46ce..f026f8926d79 100644 --- a/include/linux/firmware.h +++ b/include/linux/firmware.h @@ -4,6 +4,7 @@ #include #include +#include #include #define FW_ACTION_NOUEVENT 0 @@ -198,4 +199,6 @@ static inline void firmware_upload_unregister(struct fw_upload *fw_upload) int firmware_request_cache(struct device *device, const char *name); +DEFINE_FREE(firmware, struct firmware *, release_firmware(_T)) + #endif -- cgit v1.2.3 From bbf6cfba49a117c502ec5df66d3ab3b485c113f8 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 1 Mar 2024 20:00:05 +0200 Subject: driver core: Drop unneeded 'extern' keyword in fwnode.h We do not use 'extern' keyword with functions. Remove the last one mistakenly added to fwnode.h. Reviewed-by: Sakari Ailus Acked-by: Saravana Kannan Acked-by: "Rafael J. 
Wysocki" Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240301180138.271590-2-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/fwnode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 2a72f55d26eb..2d23a14857c7 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -209,9 +209,9 @@ static inline void fwnode_dev_initialized(struct fwnode_handle *fwnode, fwnode->flags &= ~FWNODE_FLAG_INITIALIZED; } -extern bool fw_devlink_is_strict(void); int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup); void fwnode_links_purge(struct fwnode_handle *fwnode); void fw_devlink_purge_absent_suppliers(struct fwnode_handle *fwnode); +bool fw_devlink_is_strict(void); #endif -- cgit v1.2.3 From 1c4002aeab3c81afa8a00ae76b1ea38d066e9978 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 1 Mar 2024 20:00:06 +0200 Subject: driver core: Move fw_devlink stuff to where it belongs A few APIs, i.e. fwnode_is_ancestor_of(), fwnode_get_next_parent_dev(), and get_dev_from_fwnode(), that belong specifically to the fw_devlink APIs, may be static, but they are not. Resolve this mess by moving them to drivers/base/core.c, where all the users reside, and making them static. No functional changes intended. Reviewed-by: Sakari Ailus Acked-by: "Rafael J. 
Wysocki" Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240301180138.271590-3-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/fwnode.h | 1 - include/linux/property.h | 2 -- 2 files changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 2d23a14857c7..416cbe72f0c7 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -187,7 +187,6 @@ struct fwnode_operations { if (fwnode_has_op(fwnode, op)) \ (fwnode)->ops->op(fwnode, ## __VA_ARGS__); \ } while (false) -#define get_dev_from_fwnode(fwnode) get_device((fwnode)->dev) static inline void fwnode_init(struct fwnode_handle *fwnode, const struct fwnode_operations *ops) diff --git a/include/linux/property.h b/include/linux/property.h index e6516d0b7d52..284ff79ebf03 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -156,11 +156,9 @@ struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode); for (parent = fwnode_get_parent(fwnode); parent; \ parent = fwnode_get_next_parent(parent)) -struct device *fwnode_get_next_parent_dev(const struct fwnode_handle *fwnode); unsigned int fwnode_count_parents(const struct fwnode_handle *fwn); struct fwnode_handle *fwnode_get_nth_parent(struct fwnode_handle *fwn, unsigned int depth); -bool fwnode_is_ancestor_of(const struct fwnode_handle *ancestor, const struct fwnode_handle *child); struct fwnode_handle *fwnode_get_next_child_node( const struct fwnode_handle *fwnode, struct fwnode_handle *child); struct fwnode_handle *fwnode_get_next_available_child_node( -- cgit v1.2.3 From 420b104dd116cddd1615588a400b557bf4e436b4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 1 Mar 2024 20:00:07 +0200 Subject: device property: Move enum dev_dma_attr to fwnode.h The struct fwnode_operations defines one of the callbacks to return enum dev_dma_attr. But this enum is currently defined in property.h. Move it to the correct location. 
Reviewed-by: Sakari Ailus Acked-by: "Rafael J. Wysocki" Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240301180138.271590-4-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/fwnode.h | 6 ++++++ include/linux/property.h | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 416cbe72f0c7..4228c45d5ccc 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -14,6 +14,12 @@ #include #include +enum dev_dma_attr { + DEV_DMA_NOT_SUPPORTED, + DEV_DMA_NON_COHERENT, + DEV_DMA_COHERENT, +}; + struct fwnode_operations; struct device; diff --git a/include/linux/property.h b/include/linux/property.h index 284ff79ebf03..1f0135e24d00 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -27,12 +27,6 @@ enum dev_prop_type { DEV_PROP_REF, }; -enum dev_dma_attr { - DEV_DMA_NOT_SUPPORTED, - DEV_DMA_NON_COHERENT, - DEV_DMA_COHERENT, -}; - const struct fwnode_handle *__dev_fwnode_const(const struct device *dev); struct fwnode_handle *__dev_fwnode(struct device *dev); #define dev_fwnode(dev) \ -- cgit v1.2.3 From 4dc3d612ee5c3be2a4d1a73ab31bcfaaa850aa19 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 1 Mar 2024 20:00:08 +0200 Subject: device property: Don't use "proxy" headers Update header inclusions to follow IWYU (Include What You Use) principle. Reviewed-by: Sakari Ailus Acked-by: "Rafael J. 
Wysocki" Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240301180138.271590-5-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/fwnode.h | 4 ++-- include/linux/property.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 4228c45d5ccc..80f3cd91b471 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -9,10 +9,10 @@ #ifndef _LINUX_FWNODE_H_ #define _LINUX_FWNODE_H_ -#include -#include #include #include +#include +#include enum dev_dma_attr { DEV_DMA_NOT_SUPPORTED, diff --git a/include/linux/property.h b/include/linux/property.h index 1f0135e24d00..3a1045eb786c 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -11,6 +11,7 @@ #define _LINUX_PROPERTY_H_ #include +#include #include #include #include -- cgit v1.2.3 From 75cde56a5b504d07a64ce0e3f8c7410df70308a3 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Mon, 4 Mar 2024 21:04:54 -0800 Subject: driver core: Adds flags param to fwnode_link_add() Allow the callers to set fwnode link flags when adding fwnode links. Signed-off-by: Saravana Kannan Acked-by: "Rafael J. 
Wysocki" Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20240305050458.1400667-2-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/fwnode.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 80f3cd91b471..70d9c40269b9 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -214,7 +214,8 @@ static inline void fwnode_dev_initialized(struct fwnode_handle *fwnode, fwnode->flags &= ~FWNODE_FLAG_INITIALIZED; } -int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup); +int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup, + u8 flags); void fwnode_links_purge(struct fwnode_handle *fwnode); void fw_devlink_purge_absent_suppliers(struct fwnode_handle *fwnode); bool fw_devlink_is_strict(void); -- cgit v1.2.3 From b7e1241d8f77ed64404a5e4450f43a319310fc91 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Mon, 4 Mar 2024 21:04:55 -0800 Subject: driver core: Add FWLINK_FLAG_IGNORE to completely ignore a fwnode link A fwnode link between specific supplier-consumer fwnodes can be added multiple times for multiple reasons. If that dependency doesn't exist, deleting the fwnode link once doesn't guarantee that it won't get created again. So, add FWLINK_FLAG_IGNORE flag to mark a fwnode link as one that needs to be completely ignored. Since a fwnode link's flags is an OR of all the flags passed to all the fwnode_link_add() calls to create that specific fwnode link, the FWLINK_FLAG_IGNORE flag is preserved and can be used to mark a fwnode link as one that needs to be completely ignored until it is deleted. Signed-off-by: Saravana Kannan Acked-by: "Rafael J.
Wysocki" Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20240305050458.1400667-3-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/fwnode.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 70d9c40269b9..0d79070c5a70 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -59,8 +59,10 @@ struct fwnode_handle { * fwnode link flags * * CYCLE: The fwnode link is part of a cycle. Don't defer probe. + * IGNORE: Completely ignore this link, even during cycle detection. */ #define FWLINK_FLAG_CYCLE BIT(0) +#define FWLINK_FLAG_IGNORE BIT(1) struct fwnode_link { struct fwnode_handle *supplier; -- cgit v1.2.3 From cf2c2e4a3d910270903d50462aaa75140cdb2c96 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 6 Mar 2024 19:12:25 -0800 Subject: bpf: Plumb get_unmapped_area() callback into bpf_map_ops Subsequent patches introduce bpf_arena that imposes special alignment requirements on address selection. 
Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20240307031228.42896-4-alexei.starovoitov@gmail.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 785660810e6a..95e07673cdc1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -139,6 +139,9 @@ struct bpf_map_ops { int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma); __poll_t (*map_poll)(struct bpf_map *map, struct file *filp, struct poll_table_struct *pts); + unsigned long (*map_get_unmapped_area)(struct file *filep, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags); /* Functions called by bpf_local_storage maps */ int (*map_local_storage_charge)(struct bpf_local_storage_map *smap, -- cgit v1.2.3 From 2658b5a8a4eee5fad378d0bde2f221deacbc58f1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:14 +0000 Subject: net: introduce struct net_hotdata Instead of spreading networking critical fields all over the places, add a custom net_hotdata structure so that we can precisely control its layout. 
In this first patch, move : - gro_normal_batch used in rx (GRO stack) - offload_base used in rx and tx (GRO and TSO stacks) Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2767467138a0..6643452af543 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4796,7 +4796,6 @@ void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s); extern int netdev_max_backlog; extern int dev_rx_weight; extern int dev_tx_weight; -extern int gro_normal_batch; enum { NESTED_SYNC_IMM_BIT, -- cgit v1.2.3 From 0b91fa4bfb1caedd01cb6eb3b733cbc77c9edb0e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:17 +0000 Subject: net: move ptype_all into net_hotdata ptype_all is used in rx/tx fast paths. Move it to net_hotdata for better cache locality. 
Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6643452af543..b18ac8072f18 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -5306,7 +5306,6 @@ static inline const char *netdev_reg_state(const struct net_device *dev) #define PTYPE_HASH_SIZE (16) #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1) -extern struct list_head ptype_all __read_mostly; extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; extern struct net_device *blackhole_netdev; -- cgit v1.2.3 From edbc666cdcbf4a80ada4311c272a2078af87b880 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:18 +0000 Subject: net: move netdev_max_backlog to net_hotdata netdev_max_backlog is used in rx fast path. Move it to net_hotdata for better cache locality.
Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b18ac8072f18..c9a671b7bb37 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4793,7 +4793,6 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, const struct pcpu_sw_netstats __percpu *netstats); void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s); -extern int netdev_max_backlog; extern int dev_rx_weight; extern int dev_tx_weight; -- cgit v1.2.3 From 26722dc74bf08fd79564cbcad1e5f3e2aa3bf9cc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:21 +0000 Subject: net: move dev_tx_weight to net_hotdata dev_tx_weight is used in tx fast path. Move it to net_hotdata for better cache locality. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-9-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c9a671b7bb37..ad4b031098ff 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4794,7 +4794,6 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s); extern int dev_rx_weight; -extern int dev_tx_weight; enum { NESTED_SYNC_IMM_BIT, -- cgit v1.2.3 From 71c0de9bac9c1dda503322c86be4924f055dc6c9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:22 +0000 Subject: net: move dev_rx_weight to net_hotdata dev_rx_weight is read from process_backlog(). 
Move it to net_hotdata for better cache locality. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-10-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ad4b031098ff..dd641297e807 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4793,8 +4793,6 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, const struct pcpu_sw_netstats __percpu *netstats); void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s); -extern int dev_rx_weight; - enum { NESTED_SYNC_IMM_BIT, NESTED_SYNC_TODO_BIT, -- cgit v1.2.3 From aa70d2d16f280efe8aa52afc25a33b2ec8d346b6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:23 +0000 Subject: net: move skbuff_cache(s) to net_hotdata skbuff_cache, skbuff_fclone_cache and skb_small_head_cache are used in rx/tx fast paths. Move them to net_hotdata for better cache locality. 
Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-11-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3013355b63f5..d0508f90bed5 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1271,7 +1271,6 @@ static inline void consume_skb(struct sk_buff *skb) void __consume_stateless_skb(struct sk_buff *skb); void __kfree_skb(struct sk_buff *skb); -extern struct kmem_cache *skbuff_cache; void kfree_skb_partial(struct sk_buff *skb, bool head_stolen); bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, -- cgit v1.2.3 From 490a79faf95e705ba0ffd9ebf04a624b379e53c9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Mar 2024 16:00:30 +0000 Subject: net: introduce include/net/rps.h Move RPS related structures and helpers from include/linux/netdevice.h and include/net/sock.h to a new include file. 
Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240306160031.874438-18-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 82 ----------------------------------------------- 1 file changed, 82 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dd641297e807..416a800d72ba 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -225,12 +225,6 @@ struct net_device_core_stats { #include #include -#ifdef CONFIG_RPS -#include -extern struct static_key_false rps_needed; -extern struct static_key_false rfs_needed; -#endif - struct neighbour; struct neigh_parms; struct sk_buff; @@ -730,86 +724,10 @@ static inline void netdev_queue_numa_node_write(struct netdev_queue *q, int node #endif } -#ifdef CONFIG_RPS -/* - * This structure holds an RPS map which can be of variable length. The - * map is an array of CPUs. - */ -struct rps_map { - unsigned int len; - struct rcu_head rcu; - u16 cpus[]; -}; -#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + ((_num) * sizeof(u16))) - -/* - * The rps_dev_flow structure contains the mapping of a flow to a CPU, the - * tail pointer for that CPU's input queue at the time of last enqueue, and - * a hardware filter index. - */ -struct rps_dev_flow { - u16 cpu; - u16 filter; - unsigned int last_qtail; -}; -#define RPS_NO_FILTER 0xffff - -/* - * The rps_dev_flow_table structure contains a table of flow mappings. - */ -struct rps_dev_flow_table { - unsigned int mask; - struct rcu_head rcu; - struct rps_dev_flow flows[]; -}; -#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \ - ((_num) * sizeof(struct rps_dev_flow))) - -/* - * The rps_sock_flow_table contains mappings of flows to the last CPU - * on which they were processed by the application (set in recvmsg). - * Each entry is a 32bit value. 
Upper part is the high-order bits - * of flow hash, lower part is CPU number. - * rps_cpu_mask is used to partition the space, depending on number of - * possible CPUs : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1 - * For example, if 64 CPUs are possible, rps_cpu_mask = 0x3f, - * meaning we use 32-6=26 bits for the hash. - */ -struct rps_sock_flow_table { - u32 mask; - - u32 ents[] ____cacheline_aligned_in_smp; -}; -#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) - -#define RPS_NO_CPU 0xffff - -extern u32 rps_cpu_mask; -extern struct rps_sock_flow_table __rcu *rps_sock_flow_table; - -static inline void rps_record_sock_flow(struct rps_sock_flow_table *table, - u32 hash) -{ - if (table && hash) { - unsigned int index = hash & table->mask; - u32 val = hash & ~rps_cpu_mask; - - /* We only give a hint, preemption can change CPU under us */ - val |= raw_smp_processor_id(); - - /* The following WRITE_ONCE() is paired with the READ_ONCE() - * here, and another one in get_rps_cpu(). - */ - if (READ_ONCE(table->ents[index]) != val) - WRITE_ONCE(table->ents[index], val); - } -} - #ifdef CONFIG_RFS_ACCEL bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id); #endif -#endif /* CONFIG_RPS */ /* XPS map type and offset of the xps map within net_device->xps_maps[]. */ enum xps_map_type { -- cgit v1.2.3 From ab63a2387cb906d43b72a8effb611bbaecb2d0cd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 6 Mar 2024 11:55:07 -0800 Subject: netdev: add per-queue statistics The ethtool-nl family does a good job exposing various protocol related and IEEE/IETF statistics which used to get dumped under ethtool -S, with creative names. Queue stats don't have a netlink API, yet, and remain a lion's share of ethtool -S output for new drivers. Not only is that bad because the names differ driver to driver but it's also bug-prone. 
Intuitively drivers try to report only the stats for active queues, but querying ethtool stats involves multiple system calls, and the number of stats is read separately from the stats themselves. Worse still when user space asks for values of the stats, it doesn't inform the kernel how big the buffer is. If number of stats increases in the meantime kernel will overflow user buffer. Add a netlink API for dumping queue stats. Queue information is exposed via the netdev-genl family, so add the stats there. Support per-queue and sum-for-device dumps. Latter will be useful when subsequent patches add more interesting common stats than just bytes and packets. The API does not currently distinguish between HW and SW stats. The expectation is that the source of the stats will either not matter much (good packets) or be obvious (skb alloc errors). Acked-by: Stanislav Fomichev Reviewed-by: Amritha Nambiar Reviewed-by: Xuan Zhuo Link: https://lore.kernel.org/r/20240306195509.1502746-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 416a800d72ba..4230c7f3b959 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1955,6 +1955,7 @@ enum netdev_reg_state { * * @sysfs_rx_queue_group: Space for optional per-rx queue attributes * @rtnl_link_ops: Rtnl_link_ops + * @stat_ops: Optional ops for queue-aware statistics * * @gso_max_size: Maximum size of generic segmentation offload * @tso_max_size: Device (as in HW) limit on the max TSO request size @@ -2335,6 +2336,8 @@ struct net_device { const struct rtnl_link_ops *rtnl_link_ops; + const struct netdev_stat_ops *stat_ops; + /* for setting kernel sock attribute on TCP connection setup */ #define GSO_MAX_SEGS 65535u #define GSO_LEGACY_MAX_SIZE 65536u -- cgit v1.2.3 From 6025b9135f7a8b46826a5fcf947259da43bac281 Mon Sep 17 00:00:00 2001 From: Jakub 
Kicinski Date: Mon, 4 Mar 2024 06:08:47 -0800 Subject: net: dqs: add NIC stall detector based on BQL softnet_data->time_squeeze is sometimes used as a proxy for host overload or indication of scheduling problems. In practice this statistic is very noisy and has hard to grasp units - e.g. is 10 squeezes a second to be expected, or high? Delaying network (NAPI) processing leads to drops on NIC queues but also RTT bloat, impacting pacing and CA decisions. Stalls are a little hard to detect on the Rx side, because there may simply have not been any packets received in given period of time. Packet timestamps help a little bit, but again we don't know if packets are stale because we're not keeping up or because someone (*cough* cgroups) disabled IRQs for a long time. We can, however, use Tx as a proxy for Rx stalls. Most drivers use combined Rx+Tx NAPIs so if Tx gets starved so will Rx. On the Tx side we know exactly when packets get queued, and completed, so there is no uncertainty. This patch adds stall checks to BQL. Why BQL? Because it's a convenient place to add such checks, already called by most drivers, and it has copious free space in its structures (this patch adds no extra cache references or dirtying to the fast path). The algorithm takes one parameter - max delay AKA stall threshold and increments a counter whenever NAPI got delayed for at least that amount of time. It also records the length of the longest stall. To be precise every time NAPI has not polled for at least stall thrs we check if there were any Tx packets queued between last NAPI run and now - stall_thrs/2. Unlike the classic Tx watchdog this mechanism does not ignore stalls caused by Tx being disabled, or loss of link. I don't think the check is worth the complexity, and stall is a stall, whether due to host overload, flow control, link down... doesn't matter much to the application. We have been running this detector in production at Meta for 2 years, with the threshold of 8ms. 
It's the lowest value where false positives become rare. There's still a constant stream of reported stalls (especially without the ksoftirqd deferral patches reverted), those who like their stall metrics to be 0 may prefer higher value. Signed-off-by: Jakub Kicinski Signed-off-by: Breno Leitao Signed-off-by: David S. Miller --- include/linux/dynamic_queue_limits.h | 45 ++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dynamic_queue_limits.h b/include/linux/dynamic_queue_limits.h index 407c2f281b64..5693a4be0d9a 100644 --- a/include/linux/dynamic_queue_limits.h +++ b/include/linux/dynamic_queue_limits.h @@ -38,14 +38,22 @@ #ifdef __KERNEL__ +#include #include +#define DQL_HIST_LEN 4 +#define DQL_HIST_ENT(dql, idx) ((dql)->history[(idx) % DQL_HIST_LEN]) + struct dql { /* Fields accessed in enqueue path (dql_queued) */ unsigned int num_queued; /* Total ever queued */ unsigned int adj_limit; /* limit + num_completed */ unsigned int last_obj_cnt; /* Count at last queuing */ + unsigned long history_head; /* top 58 bits of jiffies */ + /* stall entries, a bit per entry */ + unsigned long history[DQL_HIST_LEN]; + /* Fields accessed only by completion path (dql_completed) */ unsigned int limit ____cacheline_aligned_in_smp; /* Current limit */ @@ -62,6 +70,13 @@ struct dql { unsigned int max_limit; /* Max limit */ unsigned int min_limit; /* Minimum limit */ unsigned int slack_hold_time; /* Time to measure slack */ + + /* Stall threshold (in jiffies), defined by user */ + unsigned short stall_thrs; + /* Longest stall detected, reported to user */ + unsigned short stall_max; + unsigned long last_reap; /* Last reap (in jiffies) */ + unsigned long stall_cnt; /* Number of stalls */ }; /* Set some static maximums */ @@ -74,6 +89,8 @@ struct dql { */ static inline void dql_queued(struct dql *dql, unsigned int count) { + unsigned long map, now, now_hi, i; + BUG_ON(count > DQL_MAX_OBJECT); dql->last_obj_cnt 
= count; @@ -86,6 +103,34 @@ static inline void dql_queued(struct dql *dql, unsigned int count) barrier(); dql->num_queued += count; + + now = jiffies; + now_hi = now / BITS_PER_LONG; + + /* The following code set a bit in the ring buffer, where each + * bit trackes time the packet was queued. The dql->history buffer + * tracks DQL_HIST_LEN * BITS_PER_LONG time (jiffies) slot + */ + if (unlikely(now_hi != dql->history_head)) { + /* About to reuse slots, clear them */ + for (i = 0; i < DQL_HIST_LEN; i++) { + /* Multiplication masks high bits */ + if (now_hi * BITS_PER_LONG == + (dql->history_head + i) * BITS_PER_LONG) + break; + DQL_HIST_ENT(dql, dql->history_head + i + 1) = 0; + } + /* pairs with smp_rmb() in dql_check_stall() */ + smp_wmb(); + WRITE_ONCE(dql->history_head, now_hi); + } + + /* __set_bit() does not guarantee WRITE_ONCE() semantics */ + map = DQL_HIST_ENT(dql, now_hi); + + /* Populate the history with an entry (bit) per queued */ + if (!(map & BIT_MASK(now))) + WRITE_ONCE(DQL_HIST_ENT(dql, now_hi), map | BIT_MASK(now)); } /* Returns how many objects can be queued, < 0 indicates over limit. */ -- cgit v1.2.3 From 6b6ca096115e5b7a85e8313f4e68a72d52db91b3 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Tue, 5 Mar 2024 15:22:28 -0300 Subject: rtc: class: make rtc_class constant Since commit 43a7206b0963 ("driver core: class: make class_register() take a const *"), the driver core allows for struct class to be in read-only memory, so move the rtc_class structure to be declared at build time placing it into read-only memory, instead of having to be dynamically allocated at boot time. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B. 
Marliere Link: https://lore.kernel.org/r/20240305-class_cleanup-abelloni-v1-1-944c026137c8@marliere.net Signed-off-by: Alexandre Belloni --- include/linux/rtc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 5f8e438a0312..3f4d315aaec9 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -42,7 +42,7 @@ static inline time64_t rtc_tm_sub(struct rtc_time *lhs, struct rtc_time *rhs) #include #include -extern struct class *rtc_class; +extern const struct class rtc_class; /* * For these RTC methods the device parameter is the physical device -- cgit v1.2.3 From 186daf2385295acf19ecf48f4d5214cc2d925933 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Mar 2024 12:53:24 -0700 Subject: io_uring/kbuf: rename REQ_F_PARTIAL_IO to REQ_F_BL_NO_RECYCLE We only use the flag for this purpose, so rename it accordingly. This further prevents various other use cases of it, keeping it clean and consistent. Then we can also check it in one spot, when recycling is attempted, and remove some dead code in io_kbuf_recycle_ring().
Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index d8111d64812b..e24893625085 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -470,7 +470,6 @@ enum { REQ_F_SKIP_LINK_CQES_BIT, REQ_F_SINGLE_POLL_BIT, REQ_F_DOUBLE_POLL_BIT, - REQ_F_PARTIAL_IO_BIT, REQ_F_APOLL_MULTISHOT_BIT, REQ_F_CLEAR_POLLIN_BIT, REQ_F_HASH_LOCKED_BIT, @@ -481,6 +480,7 @@ enum { REQ_F_CANCEL_SEQ_BIT, REQ_F_CAN_POLL_BIT, REQ_F_BL_EMPTY_BIT, + REQ_F_BL_NO_RECYCLE_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -543,8 +543,6 @@ enum { REQ_F_SINGLE_POLL = IO_REQ_FLAG(REQ_F_SINGLE_POLL_BIT), /* double poll may active */ REQ_F_DOUBLE_POLL = IO_REQ_FLAG(REQ_F_DOUBLE_POLL_BIT), - /* request has already done partial IO */ - REQ_F_PARTIAL_IO = IO_REQ_FLAG(REQ_F_PARTIAL_IO_BIT), /* fast poll multishot mode */ REQ_F_APOLL_MULTISHOT = IO_REQ_FLAG(REQ_F_APOLL_MULTISHOT_BIT), /* recvmsg special flag, clear EPOLLIN */ @@ -559,6 +557,8 @@ enum { REQ_F_CAN_POLL = IO_REQ_FLAG(REQ_F_CAN_POLL_BIT), /* buffer list was empty after selection of buffer */ REQ_F_BL_EMPTY = IO_REQ_FLAG(REQ_F_BL_EMPTY_BIT), + /* don't recycle provided buffers for this request */ + REQ_F_BL_NO_RECYCLE = IO_REQ_FLAG(REQ_F_BL_NO_RECYCLE_BIT), }; typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); -- cgit v1.2.3 From 7b937cc243e5b1df8780a0aa743ce800df6c68d1 Mon Sep 17 00:00:00 2001 From: Frank Rowand Date: Fri, 16 Feb 2024 17:05:51 -0800 Subject: of: Create of_root if no dtb provided by firmware When enabling CONFIG_OF on a platform where 'of_root' is not populated by firmware, we end up without a root node. In order to apply overlays and create subnodes of the root node, we need one. Create this root node by unflattening an empty builtin dtb. 
If firmware provides a flattened device tree (FDT) then the FDT is unflattened via setup_arch(). Otherwise, the call to unflatten(_and_copy)?_device_tree() will create an empty root node. We make of_have_populated_dt() return true only if the DTB was loaded by firmware so that existing callers don't change behavior after this patch. The call in the of platform code is removed because it prevents overlays from creating platform devices when the empty root node is used. [sboyd@kernel.org: Update of_have_populated_dt() to treat this empty dtb as not populated. Drop setup_of() initcall] Signed-off-by: Frank Rowand Link: https://lore.kernel.org/r/20230317053415.2254616-2-frowand.list@gmail.com Cc: Rob Herring Signed-off-by: Stephen Boyd Link: https://lore.kernel.org/r/20240217010557.2381548-3-sboyd@kernel.org Signed-off-by: Rob Herring --- include/linux/of.h | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index a3e8e429ad7f..d5e7acdc8c8e 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -180,11 +180,6 @@ static inline bool is_of_node(const struct fwnode_handle *fwnode) &__of_fwnode_handle_node->fwnode : NULL; \ }) -static inline bool of_have_populated_dt(void) -{ - return of_root != NULL; -} - static inline bool of_node_is_root(const struct device_node *node) { return node && (node->parent == NULL); @@ -546,11 +541,6 @@ static inline struct device_node *of_find_node_with_property( #define of_fwnode_handle(node) NULL -static inline bool of_have_populated_dt(void) -{ - return false; -} - static inline struct device_node *of_get_compatible_child(const struct device_node *parent, const char *compatible) { @@ -1634,6 +1624,21 @@ static inline bool of_device_is_system_power_controller(const struct device_node return of_property_read_bool(np, "system-power-controller"); } +/** + * of_have_populated_dt() - Has DT been populated by bootloader + * + * 
Return: True if a DTB has been populated by the bootloader and it isn't the + * empty builtin one. False otherwise. + */ +static inline bool of_have_populated_dt(void) +{ +#ifdef CONFIG_OF + return of_property_present(of_root, "compatible"); +#else + return false; +#endif +} + /* * Overlay support */ -- cgit v1.2.3 From 1cface552a5b5f6e53a855de1a503ff958e2e253 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Mar 2024 12:34:46 +0000 Subject: net: add skb_data_unref() helper Similar to skb_unref(), add skb_data_unref() to save an expensive atomic operation (and cache line dirtying) when the last reference on shinfo->dataref is released. I saw this opportunity on hosts with RAW sockets accidentally bound to UDP protocol, forcing an skb_clone() on all received packets. These RAW sockets had their receive queue full, so all cloned packets were immediately dropped. When UDP recvmsg() later consumes the original skb, skb_release_data() is hitting atomic_sub_return() quite badly, because skb->cloned has been set permanently. Note that this patch helps TCP TX performance, because the TCP stack also uses (fast) clones. This means that at least one of the two packets (the main skb or its clone) will no longer have to perform this atomic operation in skb_release_data(). Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20240307123446.2302230-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d0508f90bed5..3023bc2be6a1 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1237,6 +1237,24 @@ static inline bool skb_unref(struct sk_buff *skb) return true; } +static inline bool skb_data_unref(const struct sk_buff *skb, + struct skb_shared_info *shinfo) +{ + int bias; + + if (!skb->cloned) + return true; + + bias = skb->nohdr ?
(1 << SKB_DATAREF_SHIFT) + 1 : 1; + + if (atomic_read(&shinfo->dataref) == bias) + smp_rmb(); + else if (atomic_sub_return(bias, &shinfo->dataref)) + return false; + + return true; +} + void __fix_address kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason); -- cgit v1.2.3 From 0a5a46a6a61be7b63c12c18495d427f91f3662a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 6 Feb 2024 15:57:15 +0200 Subject: PCI/AER: Generalize TLP Header Log reading MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both AER and DPC RP PIO provide TLP Header Log registers (PCIe r6.1 secs 7.8.4 & 7.9.14) to convey error diagnostics but the struct is named after AER as the struct aer_header_log_regs. Also, not all places that handle TLP Header Log use the struct and the struct members are named individually. Generalize the struct name and members, and use it consistently where TLP Header Log is being handled so that a pcie_read_tlp_log() helper can be easily added. 
Link: https://lore.kernel.org/r/20240206135717.8565-3-ilpo.jarvinen@linux.intel.com Signed-off-by: Ilpo Järvinen [bhelgaas: drop ixgbe changes for now, tidy whitespace] Signed-off-by: Bjorn Helgaas --- include/linux/aer.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/aer.h b/include/linux/aer.h index ae0fae70d4bd..4b97f38f3fcf 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -18,11 +18,8 @@ struct pci_dev; -struct aer_header_log_regs { - u32 dw0; - u32 dw1; - u32 dw2; - u32 dw3; +struct pcie_tlp_log { + u32 dw[4]; }; struct aer_capability_regs { @@ -33,13 +30,15 @@ struct aer_capability_regs { u32 cor_status; u32 cor_mask; u32 cap_control; - struct aer_header_log_regs header_log; + struct pcie_tlp_log header_log; u32 root_command; u32 root_status; u16 cor_err_source; u16 uncor_err_source; }; +int pcie_read_tlp_log(struct pci_dev *dev, int where, struct pcie_tlp_log *log); + #if defined(CONFIG_PCIEAER) int pci_aer_clear_nonfatal_status(struct pci_dev *dev); int pcie_aer_is_native(struct pci_dev *dev); -- cgit v1.2.3 From 7a1381e8313f1f01cbecbe3fc2ddaa24fe37033a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 7 Mar 2024 15:56:10 +0100 Subject: efi/tpm: Use symbolic GUID name from spec for final events table The LINUX_EFI_ GUID identifiers are only intended to be used to refer to GUIDs that are part of the Linux implementation, and are not considered external ABI. (Famous last words). GUIDs that already have a symbolic name in the spec should use that name, to avoid confusion between firmware components. So use the official name EFI_TCG2_FINAL_EVENTS_TABLE_GUID for the TCG2 'final events' configuration table. 
Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Ilias Apalodimas Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index c74f47711f0b..464fe16411b8 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -386,6 +386,7 @@ void efi_native_runtime_setup(void); #define EFI_CONSOLE_OUT_DEVICE_GUID EFI_GUID(0xd3b36f2c, 0xd551, 0x11d4, 0x9a, 0x46, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d) #define APPLE_PROPERTIES_PROTOCOL_GUID EFI_GUID(0x91bd12fe, 0xf6c3, 0x44fb, 0xa5, 0xb7, 0x51, 0x22, 0xab, 0x30, 0x3a, 0xe0) #define EFI_TCG2_PROTOCOL_GUID EFI_GUID(0x607f766c, 0x7455, 0x42be, 0x93, 0x0b, 0xe4, 0xd7, 0x6d, 0xb2, 0x72, 0x0f) +#define EFI_TCG2_FINAL_EVENTS_TABLE_GUID EFI_GUID(0x1e2ed096, 0x30e2, 0x4254, 0xbd, 0x89, 0x86, 0x3b, 0xbe, 0xf8, 0x23, 0x25) #define EFI_LOAD_FILE_PROTOCOL_GUID EFI_GUID(0x56ec3091, 0x954c, 0x11d2, 0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b) #define EFI_LOAD_FILE2_PROTOCOL_GUID EFI_GUID(0x4006c0c1, 0xfcb3, 0x403e, 0x99, 0x6d, 0x4a, 0x6c, 0x87, 0x24, 0xe0, 0x6d) #define EFI_RT_PROPERTIES_TABLE_GUID EFI_GUID(0xeb66918a, 0x7eef, 0x402a, 0x84, 0x2e, 0x93, 0x1d, 0x21, 0xc3, 0x8a, 0xe9) @@ -411,7 +412,6 @@ void efi_native_runtime_setup(void); #define LINUX_EFI_LOADER_ENTRY_GUID EFI_GUID(0x4a67b082, 0x0a4c, 0x41cf, 0xb6, 0xc7, 0x44, 0x0b, 0x29, 0xbb, 0x8c, 0x4f) #define LINUX_EFI_RANDOM_SEED_TABLE_GUID EFI_GUID(0x1ce1e5bc, 0x7ceb, 0x42f2, 0x81, 0xe5, 0x8a, 0xad, 0xf1, 0x80, 0xf5, 0x7b) #define LINUX_EFI_TPM_EVENT_LOG_GUID EFI_GUID(0xb7799cb0, 0xeca2, 0x4943, 0x96, 0x67, 0x1f, 0xae, 0x07, 0xb7, 0x47, 0xfa) -#define LINUX_EFI_TPM_FINAL_LOG_GUID EFI_GUID(0x1e2ed096, 0x30e2, 0x4254, 0xbd, 0x89, 0x86, 0x3b, 0xbe, 0xf8, 0x23, 0x25) #define LINUX_EFI_MEMRESERVE_TABLE_GUID EFI_GUID(0x888eb0c6, 0x8ede, 0x4ff5, 0xa8, 0xf0, 0x9a, 0xee, 0x5c, 0xb9, 0x77, 0xc2) #define LINUX_EFI_INITRD_MEDIA_GUID EFI_GUID(0x5568e427, 
0x68fc, 0x4f3d, 0xac, 0x74, 0xca, 0x55, 0x52, 0x31, 0xcc, 0x68) #define LINUX_EFI_MOK_VARIABLE_TABLE_GUID EFI_GUID(0xc451ed2b, 0x9694, 0x45d3, 0xba, 0xba, 0xed, 0x9f, 0x89, 0x88, 0xa3, 0x89) -- cgit v1.2.3 From 0bbe5b0ea97aaaea6387bab89919a8654b07df27 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 15 Feb 2024 03:00:01 +0000 Subject: efi/libstub: Add Confidential Computing (CC) measurement typedefs If the virtual firmware implements TPM support, TCG2 protocol will be used for kernel measurements and event logging support. But in CC environment, not all platforms support or enable the TPM feature. UEFI specification [1] exposes protocol and interfaces used for kernel measurements in CC platforms without TPM support. More details about the EFI CC measurements and logging can be found in [1]. Link: https://uefi.org/specs/UEFI/2.10/38_Confidential_Computing.html#efi-cc-measurement-protocol [1] Signed-off-by: Kuppuswamy Sathyanarayanan [ardb: Drop code changes, keep typedefs and #define's only] Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 464fe16411b8..2493d3d4429b 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -401,6 +401,7 @@ void efi_native_runtime_setup(void); #define EFI_CERT_X509_GUID EFI_GUID(0xa5c059a1, 0x94e4, 0x4aa7, 0x87, 0xb5, 0xab, 0x15, 0x5c, 0x2b, 0xf0, 0x72) #define EFI_CERT_X509_SHA256_GUID EFI_GUID(0x3bd2a492, 0x96c0, 0x4079, 0xb4, 0x20, 0xfc, 0xf9, 0x8e, 0xf1, 0x03, 0xed) #define EFI_CC_BLOB_GUID EFI_GUID(0x067b1f5f, 0xcf26, 0x44c5, 0x85, 0x54, 0x93, 0xd7, 0x77, 0x91, 0x2d, 0x42) +#define EFI_CC_MEASUREMENT_PROTOCOL_GUID EFI_GUID(0x96751a3d, 0x72f4, 0x41a6, 0xa7, 0x94, 0xed, 0x5d, 0x0e, 0x67, 0xae, 0x6b) /* * This GUID is used to pass to the kernel proper the struct screen_info -- cgit v1.2.3 From d228814b1913444dfdd9a25519ed7b38a19653e2 Mon Sep 17 00:00:00 2001 From: Kuppuswamy 
Sathyanarayanan Date: Thu, 15 Feb 2024 03:00:02 +0000 Subject: efi/libstub: Add get_event_log() support for CC platforms To allow event log info access after boot, the EFI boot stub extracts the event log information and installs it in an EFI configuration table. Currently, the EFI boot stub supports installation of the event log only for the TPM 1.2 and TPM 2.0 protocols. Extend the same support to the CC protocol. Since the CC platform also uses the TCG2 format, reuse the TPM2 support code as much as possible. Link: https://uefi.org/specs/UEFI/2.10/38_Confidential_Computing.html#efi-cc-measurement-protocol [1] Signed-off-by: Kuppuswamy Sathyanarayanan Link: https://lkml.kernel.org/r/0229a87e-fb19-4dad-99fc-4afd7ed4099a%40collabora.com [ardb: Split out final events table handling to avoid version confusion] Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 2493d3d4429b..f0d56f106b60 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -402,6 +402,7 @@ void efi_native_runtime_setup(void); #define EFI_CERT_X509_SHA256_GUID EFI_GUID(0x3bd2a492, 0x96c0, 0x4079, 0xb4, 0x20, 0xfc, 0xf9, 0x8e, 0xf1, 0x03, 0xed) #define EFI_CC_BLOB_GUID EFI_GUID(0x067b1f5f, 0xcf26, 0x44c5, 0x85, 0x54, 0x93, 0xd7, 0x77, 0x91, 0x2d, 0x42) #define EFI_CC_MEASUREMENT_PROTOCOL_GUID EFI_GUID(0x96751a3d, 0x72f4, 0x41a6, 0xa7, 0x94, 0xed, 0x5d, 0x0e, 0x67, 0xae, 0x6b) +#define EFI_CC_FINAL_EVENTS_TABLE_GUID EFI_GUID(0xdd4a4648, 0x2de7, 0x4665, 0x96, 0x4d, 0x21, 0xd9, 0xef, 0x5f, 0xb4, 0x46) /* * This GUID is used to pass to the kernel proper the struct screen_info -- cgit v1.2.3 From edc99a2dd3ce07f61c379e641e417c07226be5ec Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 15 Feb 2024 13:42:40 -0500 Subject: nfs: remove unused NFS_CALL macro Nothing uses this, and thank goodness, as the syntax looks horrid. 
Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 539b57fbf3ce..d09b9773b20c 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1820,13 +1820,6 @@ struct nfs_rpc_ops { void (*disable_swap)(struct inode *inode); }; -/* - * NFS_CALL(getattr, inode, (fattr)); - * into - * NFS_PROTO(inode)->getattr(fattr); - */ -#define NFS_CALL(op, inode, args) NFS_PROTO(inode)->op args - /* * Function vectors etc. for the NFS client */ -- cgit v1.2.3 From 2057a48d0dd00c6a2a94ded7df2bf1d3f2a4a0da Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 15 Feb 2024 14:57:30 -0500 Subject: sunrpc: add a struct rpc_stats arg to rpc_create_args We want to be able to have our rpc stats handled in a per network namespace manner, so add an option to rpc_create_args to specify a different rpc_stats struct instead of using the one on the rpc_program. Signed-off-by: Josef Bacik Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 5e9d1469c6fa..5321585c778f 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -139,6 +139,7 @@ struct rpc_create_args { const char *servername; const char *nodename; const struct rpc_program *program; + struct rpc_stat *stats; u32 prognumber; /* overrides program->number */ u32 version; rpc_authflavor_t authflavor; -- cgit v1.2.3 From 17f46b803d4f23c66cacce81db35fef3adb8f2af Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 1 Mar 2024 11:49:57 -0500 Subject: nfs: fix UAF in direct writes In production we have been hitting the following warning consistently ------------[ cut here ]------------ refcount_t: underflow; use-after-free. 
WARNING: CPU: 17 PID: 1800359 at lib/refcount.c:28 refcount_warn_saturate+0x9c/0xe0 Workqueue: nfsiod nfs_direct_write_schedule_work [nfs] RIP: 0010:refcount_warn_saturate+0x9c/0xe0 PKRU: 55555554 Call Trace: ? __warn+0x9f/0x130 ? refcount_warn_saturate+0x9c/0xe0 ? report_bug+0xcc/0x150 ? handle_bug+0x3d/0x70 ? exc_invalid_op+0x16/0x40 ? asm_exc_invalid_op+0x16/0x20 ? refcount_warn_saturate+0x9c/0xe0 nfs_direct_write_schedule_work+0x237/0x250 [nfs] process_one_work+0x12f/0x4a0 worker_thread+0x14e/0x3b0 ? ZSTD_getCParams_internal+0x220/0x220 kthread+0xdc/0x120 ? __btf_name_valid+0xa0/0xa0 ret_from_fork+0x1f/0x30 This is because we're completing the nfs_direct_request twice in a row. The source of this is when we have our commit requests to submit, we process them and send them off, and then in the completion path for the commit requests we have if (nfs_commit_end(cinfo.mds)) nfs_direct_write_complete(dreq); However since we're submitting asynchronous requests we sometimes have one that completes before we submit the next one, so we end up calling complete on the nfs_direct_request twice. The only other place we use nfs_generic_commit_list() is in __nfs_commit_inode, which wraps this call in a nfs_commit_begin(); nfs_commit_end(); Which is a common pattern for this style of completion handling, one that is also repeated in the direct code with get_dreq()/put_dreq() calls around where we process events as well as in the completion paths. Fix this by using the same pattern for the commit requests. Before with my 200 node rocksdb stress running this warning would pop every 10ish minutes. With my patch the stress test has been running for several hours without popping. 
Signed-off-by: Josef Bacik Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index f5ce7b101146..d59116ac8209 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -611,6 +611,7 @@ int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio); extern int nfs_commit_inode(struct inode *, int); extern struct nfs_commit_data *nfs_commitdata_alloc(void); extern void nfs_commit_free(struct nfs_commit_data *data); +void nfs_commit_begin(struct nfs_mds_commit_info *cinfo); bool nfs_commit_end(struct nfs_mds_commit_info *cinfo); static inline bool nfs_have_writebacks(const struct inode *inode) -- cgit v1.2.3 From 3f6d5e6a468d02676244b868b210433831846127 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 25 Jan 2024 19:00:24 -0500 Subject: mm: introduce memalloc_flags_{save,restore} Our proliferation of memalloc_*_{save,restore} APIs is getting a bit silly, this adds a generic version and converts the existing save/restore functions to wrappers. Signed-off-by: Kent Overstreet Cc: Vlastimil Babka Cc: Matthew Wilcox Cc: Michal Hocko Cc: Darrick J. Wong Cc: linux-mm@kvack.org Acked-by: Vlastimil Babka --- include/linux/sched/mm.h | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 9a19f1b42f64..f00d7ecc2adf 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -306,6 +306,24 @@ static inline void might_alloc(gfp_t gfp_mask) might_sleep_if(gfpflags_allow_blocking(gfp_mask)); } +/** + * memalloc_flags_save - Add a PF_* flag to current->flags, save old value + * + * This allows PF_* flags to be conveniently added, irrespective of current + * value, and then the old version restored with memalloc_flags_restore(). 
+ */ +static inline unsigned memalloc_flags_save(unsigned flags) +{ + unsigned oldflags = ~current->flags & flags; + current->flags |= flags; + return oldflags; +} + +static inline void memalloc_flags_restore(unsigned flags) +{ + current->flags &= ~flags; +} + /** * memalloc_noio_save - Marks implicit GFP_NOIO allocation scope. * @@ -319,9 +337,7 @@ static inline void might_alloc(gfp_t gfp_mask) */ static inline unsigned int memalloc_noio_save(void) { - unsigned int flags = current->flags & PF_MEMALLOC_NOIO; - current->flags |= PF_MEMALLOC_NOIO; - return flags; + return memalloc_flags_save(PF_MEMALLOC_NOIO); } /** @@ -334,7 +350,7 @@ static inline unsigned int memalloc_noio_save(void) */ static inline void memalloc_noio_restore(unsigned int flags) { - current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags; + memalloc_flags_restore(flags); } /** @@ -350,9 +366,7 @@ static inline void memalloc_noio_restore(unsigned int flags) */ static inline unsigned int memalloc_nofs_save(void) { - unsigned int flags = current->flags & PF_MEMALLOC_NOFS; - current->flags |= PF_MEMALLOC_NOFS; - return flags; + return memalloc_flags_save(PF_MEMALLOC_NOFS); } /** @@ -365,32 +379,27 @@ static inline unsigned int memalloc_nofs_save(void) */ static inline void memalloc_nofs_restore(unsigned int flags) { - current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags; + memalloc_flags_restore(flags); } static inline unsigned int memalloc_noreclaim_save(void) { - unsigned int flags = current->flags & PF_MEMALLOC; - current->flags |= PF_MEMALLOC; - return flags; + return memalloc_flags_save(PF_MEMALLOC); } static inline void memalloc_noreclaim_restore(unsigned int flags) { - current->flags = (current->flags & ~PF_MEMALLOC) | flags; + memalloc_flags_restore(flags); } static inline unsigned int memalloc_pin_save(void) { - unsigned int flags = current->flags & PF_MEMALLOC_PIN; - - current->flags |= PF_MEMALLOC_PIN; - return flags; + return memalloc_flags_save(PF_MEMALLOC_PIN); } static 
inline void memalloc_pin_restore(unsigned int flags) { - current->flags = (current->flags & ~PF_MEMALLOC_PIN) | flags; + memalloc_flags_restore(flags); } #ifdef CONFIG_MEMCG -- cgit v1.2.3 From eab0af905bfc3e9c05da2ca163d76a1513159aa4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 25 Jan 2024 19:00:24 -0500 Subject: mm: introduce PF_MEMALLOC_NORECLAIM, PF_MEMALLOC_NOWARN Introduce PF_MEMALLOC_* equivalents of some GFP_ flags: PF_MEMALLOC_NORECLAIM -> GFP_NOWAIT PF_MEMALLOC_NOWARN -> __GFP_NOWARN Cc: Vlastimil Babka Cc: Matthew Wilcox Cc: Michal Hocko Cc: Darrick J. Wong Cc: linux-mm@kvack.org Signed-off-by: Kent Overstreet --- include/linux/sched.h | 4 ++-- include/linux/sched/mm.h | 17 +++++++++++++---- 2 files changed, 15 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index ffe8f618ab86..192e2c892040 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1636,8 +1636,8 @@ extern struct pid *cad_pid; * I am cleaning dirty pages from some other bdi. 
*/ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ -#define PF__HOLE__00800000 0x00800000 -#define PF__HOLE__01000000 0x01000000 +#define PF_MEMALLOC_NORECLAIM 0x00800000 /* All allocation requests will clear __GFP_DIRECT_RECLAIM */ +#define PF_MEMALLOC_NOWARN 0x01000000 /* All allocation requests will inherit __GFP_NOWARN */ #define PF__HOLE__02000000 0x02000000 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index f00d7ecc2adf..c29059a76052 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -236,16 +236,25 @@ static inline gfp_t current_gfp_context(gfp_t flags) { unsigned int pflags = READ_ONCE(current->flags); - if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) { + if (unlikely(pflags & (PF_MEMALLOC_NOIO | + PF_MEMALLOC_NOFS | + PF_MEMALLOC_NORECLAIM | + PF_MEMALLOC_NOWARN | + PF_MEMALLOC_PIN))) { /* - * NOIO implies both NOIO and NOFS and it is a weaker context - * so always make sure it makes precedence + * Stronger flags before weaker flags: + * NORECLAIM implies NOIO, which in turn implies NOFS */ - if (pflags & PF_MEMALLOC_NOIO) + if (pflags & PF_MEMALLOC_NORECLAIM) + flags &= ~__GFP_DIRECT_RECLAIM; + else if (pflags & PF_MEMALLOC_NOIO) flags &= ~(__GFP_IO | __GFP_FS); else if (pflags & PF_MEMALLOC_NOFS) flags &= ~__GFP_FS; + if (pflags & PF_MEMALLOC_NOWARN) + flags |= __GFP_NOWARN; + if (pflags & PF_MEMALLOC_PIN) flags &= ~__GFP_MOVABLE; } -- cgit v1.2.3 From a4735d40a5da96a637af6e5bf9f6ec8b9d996acd Mon Sep 17 00:00:00 2001 From: "Ricardo B. 
Marliere" Date: Sat, 9 Mar 2024 21:10:08 -0800 Subject: Input: make input_class constant Since commit 43a7206b0963 ("driver core: class: make class_register() take a const *"), the driver core allows for struct class to be in read-only memory, so move the input_class structure to be declared at build time placing it into read-only memory, instead of having to be dynamically allocated at boot time. Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B. Marliere Link: https://lore.kernel.org/r/20240305-class_cleanup-input-v1-1-0c3d950c25db@marliere.net Signed-off-by: Dmitry Torokhov --- include/linux/input.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/input.h b/include/linux/input.h index de6503c0edb8..c22ac465254b 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -514,7 +514,7 @@ void input_enable_softrepeat(struct input_dev *dev, int delay, int period); bool input_device_enabled(struct input_dev *dev); -extern struct class input_class; +extern const struct class input_class; /** * struct ff_device - force-feedback part of an input device -- cgit v1.2.3 From abb3f9717a67a2666b2bc2f19543a657e3d4ad63 Mon Sep 17 00:00:00 2001 From: Sibi Sankar Date: Tue, 27 Feb 2024 23:04:32 +0530 Subject: OPP: Extend dev_pm_opp_data with turbo support Let's extend the dev_pm_opp_data with a turbo variable, to allow users to specify if it's a boost frequency for a dynamically added OPP. Signed-off-by: Sibi Sankar Signed-off-by: Viresh Kumar --- include/linux/pm_opp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 76dcb7f37bcd..fa9b63c6bf0b 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -87,12 +87,14 @@ struct dev_pm_opp_config { /** * struct dev_pm_opp_data - The data to use to initialize an OPP. + * @turbo: Flag to indicate whether the OPP is to be marked turbo or not. 
* @level: The performance level for the OPP. Set level to OPP_LEVEL_UNSET if * level field isn't used. * @freq: The clock rate in Hz for the OPP. * @u_volt: The voltage in uV for the OPP. */ struct dev_pm_opp_data { + bool turbo; unsigned int level; unsigned long freq; unsigned long u_volt; -- cgit v1.2.3 From 838a4772bfc390a14b31c25dc4c9eb66de5f5b1a Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 18 Jan 2024 16:19:13 +0530 Subject: cpufreq: Move dev_pm_opp_{init|free}_cpufreq_table() to pm_opp.h Move the declaration of functions defined in the OPP core to pm_opp.h. These were added to cpufreq.h as it was the only user of the APIs, but that was a mistake perhaps. Fix it. Signed-off-by: Viresh Kumar --- include/linux/cpufreq.h | 20 -------------------- include/linux/pm_opp.h | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index afda5f24d3dd..8ff3e79727d8 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -694,26 +694,6 @@ struct cpufreq_frequency_table { * order */ }; -#if defined(CONFIG_CPU_FREQ) && defined(CONFIG_PM_OPP) -int dev_pm_opp_init_cpufreq_table(struct device *dev, - struct cpufreq_frequency_table **table); -void dev_pm_opp_free_cpufreq_table(struct device *dev, - struct cpufreq_frequency_table **table); -#else -static inline int dev_pm_opp_init_cpufreq_table(struct device *dev, - struct cpufreq_frequency_table - **table) -{ - return -EINVAL; -} - -static inline void dev_pm_opp_free_cpufreq_table(struct device *dev, - struct cpufreq_frequency_table - **table) -{ -} -#endif - /* * cpufreq_for_each_entry - iterate over a cpufreq_frequency_table * @pos: the cpufreq_frequency_table * to use as a loop cursor. 
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index fa9b63c6bf0b..065a47382302 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -16,6 +16,7 @@ #include struct clk; +struct cpufreq_frequency_table; struct regulator; struct dev_pm_opp; struct device; @@ -446,6 +447,21 @@ static inline int dev_pm_opp_sync_regulators(struct device *dev) #endif /* CONFIG_PM_OPP */ +#if defined(CONFIG_CPU_FREQ) && defined(CONFIG_PM_OPP) +int dev_pm_opp_init_cpufreq_table(struct device *dev, struct cpufreq_frequency_table **table); +void dev_pm_opp_free_cpufreq_table(struct device *dev, struct cpufreq_frequency_table **table); +#else +static inline int dev_pm_opp_init_cpufreq_table(struct device *dev, struct cpufreq_frequency_table **table) +{ + return -EINVAL; +} + +static inline void dev_pm_opp_free_cpufreq_table(struct device *dev, struct cpufreq_frequency_table **table) +{ +} +#endif + + #if defined(CONFIG_PM_OPP) && defined(CONFIG_OF) int dev_pm_opp_of_add_table(struct device *dev); int dev_pm_opp_of_add_table_indexed(struct device *dev, int index); -- cgit v1.2.3 From de5f84338970815b9fdd3497a975fb572d11e0b5 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 7 Mar 2024 12:39:06 +0100 Subject: lib/bitmap: Introduce bitmap_scatter() and bitmap_gather() helpers These helpers scatter or gather a bitmap with the help of the mask position bits parameter. bitmap_scatter() does the following: src: 0000000001011010 |||||| +------+||||| | +----+|||| | |+----+||| | || +-+|| | || | || mask: ...v..vv...v..vv ...0..11...0..10 dst: 0000001100000010 and bitmap_gather() performs this one: mask: ...v..vv...v..vv src: 0000001100000010 ^ ^^ ^ 0 | || | 10 | || > 010 | |+--> 1010 | +--> 11010 +----> 011010 dst: 0000000000011010 bitmap_gather() can be seen as the reverse bitmap_scatter() operation. 
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/lkml/20230926052007.3917389-3-andriy.shevchenko@linux.intel.com/ Co-developed-by: Herve Codina Signed-off-by: Herve Codina Acked-by: Yury Norov Signed-off-by: David S. Miller --- include/linux/bitmap.h | 101 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 5814e9ee40ba..00b8c6d4355c 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -63,6 +63,8 @@ struct device; * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n * bitmap_cut(dst, src, first, n, nbits) Cut n bits from first, copy rest * bitmap_replace(dst, old, new, mask, nbits) *dst = (*old & ~(*mask)) | (*new & *mask) + * bitmap_scatter(dst, src, mask, nbits) *dst = map(dense, sparse)(src) + * bitmap_gather(dst, src, mask, nbits) *dst = map(sparse, dense)(src) * bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src) * bitmap_bitremap(oldbit, old, new, nbits) newbit = map(old, new)(oldbit) * bitmap_onto(dst, orig, relmap, nbits) *dst = orig relative to relmap @@ -499,6 +501,105 @@ static inline void bitmap_replace(unsigned long *dst, __bitmap_replace(dst, old, new, mask, nbits); } +/** + * bitmap_scatter - Scatter a bitmap according to the given mask + * @dst: scattered bitmap + * @src: gathered bitmap + * @mask: mask representing bits to assign to in the scattered bitmap + * @nbits: number of bits in each of these bitmaps + * + * Scatters bitmap with sequential bits according to the given @mask. + * + * Example: + * If @src bitmap = 0x005a, with @mask = 0x1313, @dst will be 0x0302. 
+ * + * Or in binary form + * @src @mask @dst + * 0000000001011010 0001001100010011 0000001100000010 + * + * (Bits 0, 1, 2, 3, 4, 5 are copied to the bits 0, 1, 4, 8, 9, 12) + * + * A more 'visual' description of the operation: + * src: 0000000001011010 + * |||||| + * +------+||||| + * | +----+|||| + * | |+----+||| + * | || +-+|| + * | || | || + * mask: ...v..vv...v..vv + * ...0..11...0..10 + * dst: 0000001100000010 + * + * A relationship exists between bitmap_scatter() and bitmap_gather(). + * bitmap_gather() can be seen as the 'reverse' bitmap_scatter() operation. + * See bitmap_gather() for details related to this relationship. + */ +static inline void bitmap_scatter(unsigned long *dst, const unsigned long *src, + const unsigned long *mask, unsigned int nbits) +{ + unsigned int n = 0; + unsigned int bit; + + bitmap_zero(dst, nbits); + + for_each_set_bit(bit, mask, nbits) + __assign_bit(bit, dst, test_bit(n++, src)); +} + +/** + * bitmap_gather - Gather a bitmap according to given mask + * @dst: gathered bitmap + * @src: scattered bitmap + * @mask: mask representing bits to extract from in the scattered bitmap + * @nbits: number of bits in each of these bitmaps + * + * Gathers bitmap with sparse bits according to the given @mask. + * + * Example: + * If @src bitmap = 0x0302, with @mask = 0x1313, @dst will be 0x001a. + * + * Or in binary form + * @src @mask @dst + * 0000001100000010 0001001100010011 0000000000011010 + * + * (Bits 0, 1, 4, 8, 9, 12 are copied to the bits 0, 1, 2, 3, 4, 5) + * + * A more 'visual' description of the operation: + * mask: ...v..vv...v..vv + * src: 0000001100000010 + * ^ ^^ ^ 0 + * | || | 10 + * | || > 010 + * | |+--> 1010 + * | +--> 11010 + * +----> 011010 + * dst: 0000000000011010 + * + * A relationship exists between bitmap_gather() and bitmap_scatter(). See + * bitmap_scatter() for the bitmap scatter detailed operations. + * Suppose scattered is computed using bitmap_scatter(scattered, src, mask, n). 
+ * The operation bitmap_gather(result, scattered, mask, n) leads to a result + * equal or equivalent to src. + * + * The result can be 'equivalent' because bitmap_scatter() and bitmap_gather() + * are not bijective. + * The result and src values are equivalent in the sense that a call to + * bitmap_scatter(res, src, mask, n) and a call to + * bitmap_scatter(res, result, mask, n) will lead to the same res value. + */ +static inline void bitmap_gather(unsigned long *dst, const unsigned long *src, + const unsigned long *mask, unsigned int nbits) +{ + unsigned int n = 0; + unsigned int bit; + + bitmap_zero(dst, nbits); + + for_each_set_bit(bit, mask, nbits) + __assign_bit(n++, dst, test_bit(bit, src)); +} + static inline void bitmap_next_set_region(unsigned long *bitmap, unsigned int *rs, unsigned int *re, unsigned int end) -- cgit v1.2.3 From cb8a2ef0848ca80d67d6d56e2df757cfdf6b3355 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Mon, 11 Mar 2024 22:23:47 +0800 Subject: LoongArch: Add ORC stack unwinder support The kernel CONFIG_UNWINDER_ORC option enables the ORC unwinder, which is similar in concept to a DWARF unwinder. The difference is that the format of the ORC data is much simpler than DWARF, which in turn allows the ORC unwinder to be much simpler and faster. The ORC data consists of unwind tables which are generated by objtool. After analyzing all the code paths of a .o file, it determines information about the stack state at each instruction address in the file and outputs that information to the .orc_unwind and .orc_unwind_ip sections. The per-object ORC sections are combined at link time and are sorted and post-processed at boot time. The unwinder uses the resulting data to correlate instruction addresses with their stack states at run time. Most of the logic is similar to x86. In order to get ra info before ra is saved into the stack, add ra_reg and ra_offset into orc_entry. 
At the same time, modify some arch-specific code to silence the objtool warnings. Co-developed-by: Jinyang He Signed-off-by: Jinyang He Co-developed-by: Youling Tang Signed-off-by: Youling Tang Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- include/linux/compiler.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index bb1339c7057b..39f2d4a05208 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -116,6 +116,14 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, */ #define __stringify_label(n) #n +#define __annotate_reachable(c) ({ \ + asm volatile(__stringify_label(c) ":\n\t" \ + ".pushsection .discard.reachable\n\t" \ + ".long " __stringify_label(c) "b - .\n\t" \ + ".popsection\n\t"); \ +}) +#define annotate_reachable() __annotate_reachable(__COUNTER__) + #define __annotate_unreachable(c) ({ \ asm volatile(__stringify_label(c) ":\n\t" \ ".pushsection .discard.unreachable\n\t" \ @@ -128,6 +136,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #define __annotate_jump_table __section(".rodata..c_jump_table") #else /* !CONFIG_OBJTOOL */ +#define annotate_reachable() #define annotate_unreachable() #define __annotate_jump_table #endif /* CONFIG_OBJTOOL */ -- cgit v1.2.3 From d7bca9199a27b8690ae1c71dc11f825154af7234 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 8 Mar 2024 09:12:54 -0800 Subject: mm: Introduce vmap_page_range() to map pages in PCI address space ioremap_page_range() should be used for ranges within vmalloc range only. The vmalloc ranges are allocated by get_vm_area(). PCI has "resource" allocator that manages PCI_IOBASE, IO_SPACE_LIMIT address range, hence introduce vmap_page_range() to be used exclusively to map pages in PCI address space. 
Fixes: 3e49a866c9dc ("mm: Enforce VM_IOREMAP flag and range in ioremap_page_range.") Reported-by: Miguel Ojeda Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Reviewed-by: Christoph Hellwig Tested-by: Miguel Ojeda Link: https://lore.kernel.org/bpf/CANiq72ka4rir+RTN2FQoT=Vvprp_Ao-CvoYEkSNqtSY+RZj+AA@mail.gmail.com --- include/linux/io.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io.h b/include/linux/io.h index 7304f2a69960..235ba7d80a8f 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -23,12 +23,19 @@ void __iowrite64_copy(void __iomem *to, const void *from, size_t count); #ifdef CONFIG_MMU int ioremap_page_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot); +int vmap_page_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot); #else static inline int ioremap_page_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { return 0; } +static inline int vmap_page_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot) +{ + return 0; +} #endif /* -- cgit v1.2.3 From b620ecbd17a03cacd06f014a5d3f3a11285ce053 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 8 Mar 2024 16:05:24 -0700 Subject: vfio: Introduce interface to flush virqfd inject workqueue In order to synchronize changes that can affect the thread callback, introduce an interface to force a flush of the inject workqueue. The irqfd pointer is only valid under spinlock, but the workqueue cannot be flushed under spinlock. Therefore the flush work for the irqfd is queued under spinlock. The vfio_irqfd_cleanup_wq workqueue is re-used for queuing this work such that flushing the workqueue is also ordered relative to shutdown. 
Reviewed-by: Kevin Tian Reviewed-by: Reinette Chatre Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20240308230557.805580-4-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 89b265bc6ec3..8b1a29820409 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -356,6 +356,7 @@ struct virqfd { wait_queue_entry_t wait; poll_table pt; struct work_struct shutdown; + struct work_struct flush_inject; struct virqfd **pvirqfd; }; @@ -363,5 +364,6 @@ int vfio_virqfd_enable(void *opaque, int (*handler)(void *, void *), void (*thread)(void *, void *), void *data, struct virqfd **pvirqfd, int fd); void vfio_virqfd_disable(struct virqfd **pvirqfd); +void vfio_virqfd_flush_thread(struct virqfd **pvirqfd); #endif /* VFIO_H */ -- cgit v1.2.3 From 8076fcde016c9c0e0660543e67bff86cb48a7c9c Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 11 Mar 2024 12:29:43 -0700 Subject: x86/rfds: Mitigate Register File Data Sampling (RFDS) RFDS is a CPU vulnerability that may allow userspace to infer kernel stale data previously used in floating point registers, vector registers and integer registers. RFDS only affects certain Intel Atom processors. Intel released a microcode update that uses VERW instruction to clear the affected CPU buffers. Unlike MDS, none of the affected cores support SMT. Add RFDS bug infrastructure and enable the VERW based mitigation by default, that clears the affected buffers just before exiting to userspace. Also add sysfs reporting and cmdline parameter "reg_file_data_sampling" to control the mitigation. 
For details see: Documentation/admin-guide/hw-vuln/reg-file-data-sampling.rst Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Acked-by: Josh Poimboeuf --- include/linux/cpu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index dcb89c987164..8654714421a0 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -75,6 +75,8 @@ extern ssize_t cpu_show_spec_rstack_overflow(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_reg_file_data_sampling(struct device *dev, + struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, -- cgit v1.2.3 From e5b7aefe38f7f6258935d8a10c36552dd957048a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Mar 2024 10:22:30 +0000 Subject: net: gro: move two declarations to include/net/gro.h Move gro_find_receive_by_type() and gro_find_complete_by_type() to include/net/gro.h where they belong. Also use _NET_GRO_H instead of _NET_IPV6_GRO_H to protect include/net/gro.h from multiple inclusions. 
Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240308102230.296224-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4230c7f3b959..c6f6ac779b34 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3901,8 +3901,6 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); void napi_get_frags_check(struct napi_struct *napi); gro_result_t napi_gro_frags(struct napi_struct *napi); -struct packet_offload *gro_find_receive_by_type(__be16 type); -struct packet_offload *gro_find_complete_by_type(__be16 type); static inline void napi_free_frags(struct napi_struct *napi) { -- cgit v1.2.3 From 317460317a02a1af512697e6e964298dedd8a163 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Mar 2024 17:07:59 -0800 Subject: bpf: Introduce bpf_arena. Introduce bpf_arena, which is a sparse shared memory region between the bpf program and user space. Use cases: 1. User space mmap-s bpf_arena and uses it as a traditional mmap-ed anonymous region, like memcached or any key/value storage. The bpf program implements an in-kernel accelerator. XDP prog can search for a key in bpf_arena and return a value without going to user space. 2. The bpf program builds arbitrary data structures in bpf_arena (hash tables, rb-trees, sparse arrays), while user space consumes it. 3. bpf_arena is a "heap" of memory from the bpf program's point of view. The user space may mmap it, but bpf program will not convert pointers to user base at run-time to improve bpf program speed. Initially, the kernel vm_area and user vma are not populated. User space can fault in pages within the range. While servicing a page fault, bpf_arena logic will insert a new page into the kernel and user vmas. 
The bpf program can allocate pages from that region via bpf_arena_alloc_pages(). This kernel function will insert pages into the kernel vm_area. The subsequent fault-in from user space will populate that page into the user vma. The BPF_F_SEGV_ON_FAULT flag at arena creation time can be used to prevent fault-in from user space. In such a case, if a page is not allocated by the bpf program and not present in the kernel vm_area, the user process will segfault. This is useful for use cases 2 and 3 above. bpf_arena_alloc_pages() is similar to user space mmap(). It allocates pages either at a specific address within the arena or allocates a range with the maple tree. bpf_arena_free_pages() is analogous to munmap(), which frees pages and removes the range from the kernel vm_area and from user process vmas. bpf_arena can be used as a bpf program "heap" of up to 4GB. The speed of bpf program is more important than ease of sharing with user space. This is use case 3. In such a case, the BPF_F_NO_USER_CONV flag is recommended. It will tell the verifier to treat the rX = bpf_arena_cast_user(rY) instruction as a 32-bit move wX = wY, which will improve bpf prog performance. Otherwise, bpf_arena_cast_user is translated by JIT to conditionally add the upper 32 bits of user vm_start (if the pointer is not NULL) to arena pointers before they are stored into memory. This way, user space sees them as valid 64-bit pointers. Diff https://github.com/llvm/llvm-project/pull/84410 enables LLVM BPF backend generate the bpf_addr_space_cast() instruction to cast pointers between address_space(1) which is reserved for bpf_arena pointers and default address space zero. All arena pointers in a bpf program written in C language are tagged as __attribute__((address_space(1))). Hence, clang provides helpful diagnostics when pointers cross address space. Libbpf and the kernel support only address_space == 1. All other address space identifiers are reserved. 
rX = bpf_addr_space_cast(rY, /* dst_as */ 1, /* src_as */ 0) tells the verifier that rX->type = PTR_TO_ARENA. Any further operations on PTR_TO_ARENA register have to be in the 32-bit domain. The verifier will mark load/store through PTR_TO_ARENA with PROBE_MEM32. JIT will generate them as kern_vm_start + 32bit_addr memory accesses. The behavior is similar to copy_from_kernel_nofault() except that no address checks are necessary. The address is guaranteed to be in the 4GB range. If the page is not present, the destination register is zeroed on read, and the operation is ignored on write. rX = bpf_addr_space_cast(rY, 0, 1) tells the verifier that rX->type = unknown scalar. If arena->map_flags has BPF_F_NO_USER_CONV set, then the verifier converts such cast instructions to mov32. Otherwise, JIT will emit native code equivalent to: rX = (u32)rY; if (rY) rX |= clear_lo32_bits(arena->user_vm_start); /* replace hi32 bits in rX */ After such conversion, the pointer becomes a valid user pointer within bpf_arena range. The user process can access data structures created in bpf_arena without any additional computations. For example, a linked list built by a bpf program can be walked natively by user space. 
Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Reviewed-by: Barret Rhoden Link: https://lore.kernel.org/bpf/20240308010812.89848-2-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 7 +++++-- include/linux/bpf_types.h | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 95e07673cdc1..ea6ab6e0eef9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -37,6 +37,7 @@ struct perf_event; struct bpf_prog; struct bpf_prog_aux; struct bpf_map; +struct bpf_arena; struct sock; struct seq_file; struct btf; @@ -528,8 +529,8 @@ void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock); void bpf_rb_root_free(const struct btf_field *field, void *rb_root, struct bpf_spin_lock *spin_lock); - - +u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena); +u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena); int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size); struct bpf_offload_dev; @@ -2215,6 +2216,8 @@ int generic_map_delete_batch(struct bpf_map *map, struct bpf_map *bpf_map_get_curr_or_next(u32 *id); struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); +int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, + unsigned long nr_pages, struct page **page_array); #ifdef CONFIG_MEMCG_KMEM void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 94baced5a1ad..9f2a6b83b49e 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -132,6 +132,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_ARENA, arena_map_ops) 
BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) -- cgit v1.2.3 From 2fe99eb0ccf2bb73df65ebcbbf2f2ff70e63547b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Mar 2024 17:08:01 -0800 Subject: bpf: Add x86-64 JIT support for PROBE_MEM32 pseudo instructions. Add support for [LDX | STX | ST], PROBE_MEM32, [B | H | W | DW] instructions. They are similar to PROBE_MEM instructions with the following differences: - PROBE_MEM has to check that the address is in the kernel range with src_reg + insn->off >= TASK_SIZE_MAX + PAGE_SIZE check - PROBE_MEM doesn't support store - PROBE_MEM32 relies on the verifier to clear upper 32-bit in the register - PROBE_MEM32 adds 64-bit kern_vm_start address (which is stored in %r12 in the prologue) Due to bpf_arena constructions such %r12 + %reg + off16 access is guaranteed to be within arena virtual range, so no address check at run-time. - PROBE_MEM32 allows STX and ST. If they fault the store is a nop. When LDX faults the destination register is zeroed. 
Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20240308010812.89848-4-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 1 + include/linux/filter.h | 3 +++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ea6ab6e0eef9..8904d1606125 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1458,6 +1458,7 @@ struct bpf_prog_aux { bool xdp_has_frags; bool exception_cb; bool exception_boundary; + struct bpf_arena *arena; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; /* function name for valid attach_btf_id */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 36cc29a2934c..b119f04ecb0b 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -72,6 +72,9 @@ struct ctl_table_header; /* unused opcode to mark special ldsx instruction. Same as BPF_IND */ #define BPF_PROBE_MEMSX 0x40 +/* unused opcode to mark special load instruction. Same as BPF_MSH */ +#define BPF_PROBE_MEM32 0xa0 + /* unused opcode to mark call to interpreter with arguments */ #define BPF_CALL_ARGS 0xe0 -- cgit v1.2.3 From 142fd4d2dcf58b1720a6af644f31de1a5551f219 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Mar 2024 17:08:02 -0800 Subject: bpf: Add x86-64 JIT support for bpf_addr_space_cast instruction. LLVM generates bpf_addr_space_cast instruction while translating pointers between native (zero) address space and __attribute__((address_space(N))). The addr_space=1 is reserved as bpf_arena address space. 
rY = addr_space_cast(rX, 0, 1) is processed by the verifier and converted to normal 32-bit move: wX = wY rY = addr_space_cast(rX, 1, 0) has to be converted by JIT: aux_reg = upper_32_bits of arena->user_vm_start aux_reg <<= 32 wX = wY // clear upper 32 bits of dst register if (wX) // if not zero add upper bits of user_vm_start wX |= aux_reg JIT can do it more efficiently: mov dst_reg32, src_reg32 // 32-bit move shl dst_reg, 32 or dst_reg, user_vm_start rol dst_reg, 32 xor r11, r11 test dst_reg32, dst_reg32 // check if lower 32-bit are zero cmove r11, dst_reg // if so, set dst_reg to zero // Intel swapped src/dst register encoding in CMOVcc Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20240308010812.89848-5-alexei.starovoitov@gmail.com --- include/linux/filter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index b119f04ecb0b..c99bc3df2d28 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -962,6 +962,7 @@ bool bpf_jit_supports_kfunc_call(void); bool bpf_jit_supports_far_kfunc_call(void); bool bpf_jit_supports_exceptions(void); bool bpf_jit_supports_ptr_xchg(void); +bool bpf_jit_supports_arena(void); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); bool bpf_helper_changes_pkt_data(void *func); -- cgit v1.2.3 From 6082b6c328b5486da2b356eae94b8b83c98b5565 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Mar 2024 17:08:03 -0800 Subject: bpf: Recognize addr_space_cast instruction in the verifier. rY = addr_space_cast(rX, 0, 1) tells the verifier that rY->type = PTR_TO_ARENA. Any further operations on PTR_TO_ARENA register have to be in 32-bit domain. The verifier will mark load/store through PTR_TO_ARENA with PROBE_MEM32. JIT will generate them as kern_vm_start + 32bit_addr memory accesses. 
rY = addr_space_cast(rX, 1, 0) tells the verifier that rY->type = unknown scalar. If arena->map_flags has BPF_F_NO_USER_CONV set then convert cast_user to mov32 as well. Otherwise JIT will convert it to: rY = (u32)rX; if (rY) rY |= arena->user_vm_start & ~(u64)~0U; Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240308010812.89848-6-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 1 + include/linux/bpf_verifier.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8904d1606125..d0c836ba009d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -883,6 +883,7 @@ enum bpf_reg_type { * an explicit null check is required for this struct. */ PTR_TO_MEM, /* reg points to valid memory region */ + PTR_TO_ARENA, PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_FUNC, /* reg points to a bpf program function */ CONST_PTR_TO_DYNPTR, /* reg points to a const struct bpf_dynptr */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 4b0f6600e499..7cb1b75eee38 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -548,6 +548,7 @@ struct bpf_insn_aux_data { u32 seen; /* this insn was processed by the verifier at env->pass_cnt */ bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ bool zext_dst; /* this insn zero extends dst reg */ + bool needs_zext; /* alu op needs to clear upper bits */ bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */ bool is_iter_next; /* bpf_iter__next() kfunc call */ bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */ -- cgit v1.2.3 From 2edc3de6fb650924a87fffebebc3b7572cbf6e38 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Mar 2024 17:08:04 -0800 Subject: bpf: Recognize btf_decl_tag("arg: Arena") as PTR_TO_ARENA. 
In global bpf functions recognize btf_decl_tag("arg:arena") as PTR_TO_ARENA. Note, when the verifier sees: __weak void foo(struct bar *p) it recognizes 'p' as PTR_TO_MEM and 'struct bar' has to be a struct with scalars. Hence the only way to use arena pointers in global functions is to tag them with "arg:arena". Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20240308010812.89848-7-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d0c836ba009d..08ad265cb195 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -712,6 +712,7 @@ enum bpf_arg_type { * on eBPF program stack */ ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */ + ARG_PTR_TO_ARENA, ARG_CONST_SIZE, /* number of bytes accessed from memory */ ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */ -- cgit v1.2.3 From 66c8473135c62f478301a0e5b3012f203562dfa6 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 8 Mar 2024 16:47:39 -0800 Subject: bpf: move sleepable flag from bpf_prog_aux to bpf_prog prog->aux->sleepable is checked very frequently as part of (some) BPF program run hot paths. So this extra aux indirection seems wasteful and on busy systems might cause unnecessary memory cache misses. Let's move sleepable flag into prog itself to eliminate unnecessary pointer dereference. 
Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Message-ID: <20240309004739.2961431-1-andrii@kernel.org> Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 08ad265cb195..4f20f62f9d63 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1455,7 +1455,6 @@ struct bpf_prog_aux { bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ bool attach_tracing_prog; /* true if tracing another tracing program */ bool func_proto_unreliable; - bool sleepable; bool tail_call_reachable; bool xdp_has_frags; bool exception_cb; @@ -1541,7 +1540,8 @@ struct bpf_prog { enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */ call_get_func_ip:1, /* Do we call get_func_ip() */ - tstamp_type_access:1; /* Accessed __sk_buff->tstamp_type */ + tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */ + sleepable:1; /* BPF program is sleepable */ enum bpf_prog_type type; /* Type of BPF program */ enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ @@ -2112,14 +2112,14 @@ bpf_prog_run_array_uprobe(const struct bpf_prog_array __rcu *array_rcu, old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); item = &array->items[0]; while ((prog = READ_ONCE(item->prog))) { - if (!prog->aux->sleepable) + if (!prog->sleepable) rcu_read_lock(); run_ctx.bpf_cookie = item->bpf_cookie; ret &= run_prog(prog, ctx); item++; - if (!prog->aux->sleepable) + if (!prog->sleepable) rcu_read_unlock(); } bpf_reset_run_ctx(old_run_ctx); -- cgit v1.2.3 From a66ccfc2535418b536b1203b65f87c4f501f6bdd Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Mon, 26 Feb 2024 20:35:56 +0100 Subject: platform/x86: wmi: Do not instantiate older WMI drivers multiple times MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Many older WMI drivers cannot be instantiated multiple times for two reasons: - they are using the legacy GUID-based WMI API - they are singletons (with global state) Prevent such WMI drivers from binding to WMI devices with a duplicated GUID, as this would mean that the WMI driver will be instantiated at least two times (one for the original GUID and one for the duplicated GUID). WMI drivers which can be instantiated multiple times can signal this by setting a flag inside struct wmi_driver. Tested on a ASUS Prime B650-Plus. Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20240226193557.2888-2-W_Armin@gmx.de Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/wmi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wmi.h b/include/linux/wmi.h index 781958310bfb..63cca3b58d6d 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -49,6 +49,7 @@ u8 wmidev_instance_count(struct wmi_device *wdev); * @driver: Driver model structure * @id_table: List of WMI GUIDs supported by this driver * @no_notify_data: Driver supports WMI events which provide no event data + * @no_singleton: Driver can be instantiated multiple times * @probe: Callback for device binding * @remove: Callback for device unbinding * @notify: Callback for receiving WMI events @@ -59,6 +60,7 @@ struct wmi_driver { struct device_driver driver; const struct wmi_device_id *id_table; bool no_notify_data; + bool no_singleton; int (*probe)(struct wmi_device *wdev, const void *context); void (*remove)(struct wmi_device *wdev); -- cgit v1.2.3 From dbab9afe8640a51ffcce87bfdb59a814e0dc7780 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 5 Mar 2024 11:59:11 +0100 Subject: clk: x86: Move clk-pmc-atom register defines to include/linux/platform_data/x86/pmc_atom.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the register 
defines for the Atom (Bay Trail, Cherry Trail) PMC clocks to include/linux/platform_data/x86/pmc_atom.h. This is a preparation patch to extend the S0i3 readiness checks in drivers/platform/x86/pmc_atom.c with checking that the PMC clocks are off on suspend entry. Note these are added to include/linux/platform_data/x86/pmc_atom.h rather than to include/linux/platform_data/x86/clk-pmc-atom.h because the former already has all the other Atom PMC register defines. Reviewed-by: Ilpo Järvinen Acked-by: Stephen Boyd Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240305105915.76242-2-hdegoede@redhat.com Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/x86/pmc_atom.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/pmc_atom.h b/include/linux/platform_data/x86/pmc_atom.h index b8a701c77fd0..557622ef0390 100644 --- a/include/linux/platform_data/x86/pmc_atom.h +++ b/include/linux/platform_data/x86/pmc_atom.h @@ -43,6 +43,19 @@ BIT_ORED_DEDICATED_IRQ_GPSC | \ BIT_SHARED_IRQ_GPSS) +/* External clk generator settings */ +#define PMC_CLK_CTL_OFFSET 0x60 +#define PMC_CLK_CTL_SIZE 4 +#define PMC_CLK_NUM 6 +#define PMC_CLK_CTL_GATED_ON_D3 0x0 +#define PMC_CLK_CTL_FORCE_ON 0x1 +#define PMC_CLK_CTL_FORCE_OFF 0x2 +#define PMC_CLK_CTL_RESERVED 0x3 +#define PMC_MASK_CLK_CTL GENMASK(1, 0) +#define PMC_MASK_CLK_FREQ BIT(2) +#define PMC_CLK_FREQ_XTAL (0 << 2) /* 25 MHz */ +#define PMC_CLK_FREQ_PLL (1 << 2) /* 19.2 MHz */ + /* The timers accumulate time spent in sleep state */ #define PMC_S0IR_TMR 0x80 #define PMC_S0I1_TMR 0x84 -- cgit v1.2.3 From a21ff5a0a7948b6ef1364f8f6d07eda49426d09a Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 5 Mar 2024 11:59:12 +0100 Subject: platform/x86: pmc_atom: Annotate d3_sts register bit defines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The include/linux/platform_data/x86/pmc_atom.h d3_sts register 
bit defines are named after how these bits are used on Bay Trail devices. On Cherry Trail (CHT) devices some of these bits have a different meaning according to the datasheet. Add a comment to the defines for bits which have a different meaning on Cherry Trail devices. Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20240305105915.76242-3-hdegoede@redhat.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/x86/pmc_atom.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/pmc_atom.h b/include/linux/platform_data/x86/pmc_atom.h index 557622ef0390..161e4bc1c9ee 100644 --- a/include/linux/platform_data/x86/pmc_atom.h +++ b/include/linux/platform_data/x86/pmc_atom.h @@ -117,14 +117,14 @@ #define BIT_SCC_SDIO BIT(9) #define BIT_SCC_SDCARD BIT(10) #define BIT_SCC_MIPI BIT(11) -#define BIT_HDA BIT(12) +#define BIT_HDA BIT(12) /* CHT datasheet: reserved */ #define BIT_LPE BIT(13) #define BIT_OTG BIT(14) -#define BIT_USH BIT(15) -#define BIT_GBE BIT(16) -#define BIT_SATA BIT(17) -#define BIT_USB_EHCI BIT(18) -#define BIT_SEC BIT(19) +#define BIT_USH BIT(15) /* CHT datasheet: reserved */ +#define BIT_GBE BIT(16) /* CHT datasheet: reserved */ +#define BIT_SATA BIT(17) /* CHT datasheet: reserved */ +#define BIT_USB_EHCI BIT(18) /* CHT datasheet: XHCI! 
*/ +#define BIT_SEC BIT(19) /* BYT datasheet: reserved */ #define BIT_PCIE_PORT0 BIT(20) #define BIT_PCIE_PORT1 BIT(21) #define BIT_PCIE_PORT2 BIT(22) -- cgit v1.2.3 From fa63587f94a77a49b53274dc0fd1ea41dfde5966 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 11 Mar 2024 14:32:23 +0100 Subject: drivers/gpio/nomadik: move dummy nmk_gpio_dbg_show_one() to header When `CONFIG_DEBUG_FS` is disabled, nmk_gpio_dbg_show_one() is an empty dummy function; this however triggers a `-Wmissing-prototypes` warning and later a linker error because the function is also used by drivers/pinctrl/nomadik/pinctrl-nomadik.c, therefore it needs to be non-static. To allow both sources to access this dummy function, this patch moves it to the header, adding the `#ifdef CONFIG_DEBUG_FS` there as well. Signed-off-by: Max Kellermann Link: https://lore.kernel.org/r/20240311133223.3429428-1-max.kellermann@ionos.com Signed-off-by: Linus Walleij --- include/linux/gpio/gpio-nomadik.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio/gpio-nomadik.h b/include/linux/gpio/gpio-nomadik.h index 4a95ea7935fb..b5a84864650d 100644 --- a/include/linux/gpio/gpio-nomadik.h +++ b/include/linux/gpio/gpio-nomadik.h @@ -253,6 +253,8 @@ nmk_pinctrl_db8540_init(const struct nmk_pinctrl_soc_data **soc) struct platform_device; +#ifdef CONFIG_DEBUG_FS + /* * Symbols declared in gpio-nomadik used by pinctrl-nomadik. 
If pinctrl-nomadik * is enabled, then gpio-nomadik is enabled as well; the reverse if not always @@ -261,6 +263,19 @@ struct platform_device; void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, struct gpio_chip *chip, unsigned int offset, unsigned int gpio); + +#else + +static inline void nmk_gpio_dbg_show_one(struct seq_file *s, + struct pinctrl_dev *pctldev, + struct gpio_chip *chip, + unsigned int offset, + unsigned int gpio) +{ +} + +#endif + void __nmk_gpio_make_output(struct nmk_gpio_chip *nmk_chip, unsigned int offset, int val); void __nmk_gpio_set_slpm(struct nmk_gpio_chip *nmk_chip, unsigned int offset, -- cgit v1.2.3 From 7af9ded0c2caac0a95f33df5cb04706b0f502588 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 12 Mar 2024 08:15:07 -0400 Subject: ring-buffer: Use wait_event_interruptible() in ring_buffer_wait() Convert ring_buffer_wait() over to wait_event_interruptible(). The default condition is to execute the wait loop inside __wait_event() just once. This does not change the ring_buffer_wait() prototype yet, but restructures the code so that it can take a "cond" and "data" parameter and will call wait_event_interruptible() with a helper function as the condition. The helper function (rb_wait_cond) takes the cond function and data parameters. It will first check if the buffer hit the watermark defined by the "full" parameter and then call the passed in condition parameter. If either are true, it returns true. If rb_wait_cond() does not return true, it will set the appropriate "waiters_pending" flag and returns false. 
Link: https://lore.kernel.org/linux-trace-kernel/CAHk-=wgsNgewHFxZAJiAQznwPMqEtQmi1waeS2O1v6L4c_Um5A@mail.gmail.com/ Link: https://lore.kernel.org/linux-trace-kernel/20240312121703.399598519@goodmis.org Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Linus Torvalds Cc: linke li Cc: Rabin Vincent Fixes: f3ddb74ad0790 ("tracing: Wake up ring buffer waiters on closing of the file") Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index fa802db216f9..338a33db1577 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -98,6 +98,7 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k __ring_buffer_alloc((size), (flags), &__key); \ }) +typedef bool (*ring_buffer_cond_fn)(void *data); int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full); __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, struct file *filp, poll_table *poll_table, int full); -- cgit v1.2.3 From 2aa043a55b9a764c9cbde5a8c654eeaaffe224cf Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 12 Mar 2024 08:15:08 -0400 Subject: tracing/ring-buffer: Fix wait_on_pipe() race When the trace_pipe_raw file is closed, there should be no new readers on the file descriptor. This is mostly handled with the waking and wait_index fields of the iterator. But there's still a slight race. CPU 0 CPU 1 ----- ----- wait_index++; index = wait_index; ring_buffer_wake_waiters(); wait_on_pipe() ring_buffer_wait(); The ring_buffer_wait() will miss the wakeup from CPU 1. The problem is that the ring_buffer_wait() needs the logic of: prepare_to_wait(); if (!condition) schedule(); Where the missing condition check is the iter->wait_index update. 
Have the ring_buffer_wait() take a conditional callback function and a data parameter that can be used within the wait_event_interruptible() of the ring_buffer_wait() function. In wait_on_pipe(), pass a condition function that will check if the wait_index has been updated, if it has, it will return true to break out of the wait_event_interruptible() loop. Create a new field "closed" in the trace_iterator and set it in the .flush() callback before calling ring_buffer_wake_waiters(). This will keep any new readers from waiting on a closed file descriptor. Have the wait_on_pipe() condition callback also check the closed field. Change the wait_index field of the trace_iterator to atomic_t. There's no reason it needs to be 'long' and making it atomic and using atomic_read_acquire() and atomic_fetch_inc_release() will provide the necessary memory barriers. Add a "woken" flag to tracing_buffers_splice_read() to exit the loop after one more try to fetch data. That is, if it waited for data and something woke it up, it should try to collect any new data and then exit back to user space. 
Link: https://lore.kernel.org/linux-trace-kernel/CAHk-=wgsNgewHFxZAJiAQznwPMqEtQmi1waeS2O1v6L4c_Um5A@mail.gmail.com/ Link: https://lore.kernel.org/linux-trace-kernel/20240312121703.557950713@goodmis.org Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Linus Torvalds Cc: linke li Cc: Rabin Vincent Fixes: f3ddb74ad0790 ("tracing: Wake up ring buffer waiters on closing of the file") Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 3 ++- include/linux/trace_events.h | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 338a33db1577..dc5ae4e96aee 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -99,7 +99,8 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k }) typedef bool (*ring_buffer_cond_fn)(void *data); -int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full); +int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full, + ring_buffer_cond_fn cond, void *data); __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, struct file *filp, poll_table *poll_table, int full); void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu); diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index d68ff9b1247f..fc6d0af56bb1 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -103,13 +103,16 @@ struct trace_iterator { unsigned int temp_size; char *fmt; /* modified format holder */ unsigned int fmt_size; - long wait_index; + atomic_t wait_index; /* trace_seq for __print_flags() and __print_symbolic() etc. 
*/ struct trace_seq tmp_seq; cpumask_var_t started; + /* Set when the file is closed to prevent new waiters */ + bool closed; + /* it's true when current open file is snapshot */ bool snapshot; -- cgit v1.2.3 From 17423360a27ae58c1850f588bdd8013bbfcd250b Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Fri, 23 Feb 2024 14:58:50 -0600 Subject: PCI/ASPM: Save L1 PM Substates Capability for suspend/resume 4ff116d0d5fd ("PCI/ASPM: Save L1 PM Substates Capability for suspend/resume") restored the L1 PM Substates Capability after resume, which reduced power consumption by making the ASPM L1.x states work after resume. a7152be79b62 ("Revert "PCI/ASPM: Save L1 PM Substates Capability for suspend/resume"") reverted 4ff116d0d5fd because resume failed on some systems, so power consumption after resume increased again. a7152be79b62 mentioned that we restore L1 PM substate configuration even though ASPM L1 may already be enabled. This is due to the fact that pci_restore_aspm_l1ss_state() was called before pci_restore_pcie_state(). Save and restore the L1 PM Substates Capability, following PCIe r6.1, sec 5.5.4 more closely by: 1) Do not restore ASPM configuration in pci_restore_pcie_state() but do that after PCIe capability is restored in pci_restore_aspm_state() following PCIe r6.1, sec 5.5.4. 2) If BIOS reenables L1SS, particularly L1.2, we need to clear the enables in the right order, downstream before upstream. Defer restoring the L1SS config until we are at the downstream component. Then update the config for both ends of the link in the prescribed order. 3) Program ASPM L1 PM substate configuration before L1 enables. 4) Program ASPM L1 PM substate enables last, after rest of the fields in the capability are programmed. 
[bhelgaas: commit log, squash L1SS-related patches, do both LNKCTL restores in pci_restore_pcie_state()] Link: https://lore.kernel.org/r/20240128233212.1139663-3-david.e.box@linux.intel.com Link: https://lore.kernel.org/r/20240128233212.1139663-4-david.e.box@linux.intel.com Link: https://lore.kernel.org/r/20240223205851.114931-5-helgaas@kernel.org Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217321 Link: https://bugzilla.kernel.org/show_bug.cgi?id=216782 Link: https://bugzilla.kernel.org/show_bug.cgi?id=216877 Co-developed-by: Mika Westerberg Co-developed-by: David E. Box Reported-by: Koba Ko Signed-off-by: Mika Westerberg Signed-off-by: David E. Box Signed-off-by: Bjorn Helgaas Tested-by: Tasev Nikola # Asus UX305FA Cc: Mark Enriquez Cc: Thomas Witt Cc: Werner Sembach Cc: Vidya Sagar --- include/linux/pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index add9368e6314..6967ae7b4115 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -390,9 +390,9 @@ struct pci_dev { unsigned int d3hot_delay; /* D3hot->D0 transition time in ms */ unsigned int d3cold_delay; /* D3cold->D0 transition time in ms */ + u16 l1ss; /* L1SS Capability pointer */ #ifdef CONFIG_PCIEASPM struct pcie_link_state *link_state; /* ASPM link state */ - u16 l1ss; /* L1SS Capability pointer */ unsigned int ltr_path:1; /* Latency Tolerance Reporting supported from root to here */ #endif -- cgit v1.2.3 From 11270e526276ffad4c4237acb393da82a3287487 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 8 Mar 2024 14:59:21 -0700 Subject: base/node / ACPI: Enumerate node access class for 'struct access_coordinate' Both generic node and HMAT handling code have been using magic numbers to indicate access classes for 'struct access_coordinate'. Introduce enums to enumerate the access0 and access1 classes shared by the two subsystems. 
Update the function parameters and callers as appropriate to utilize the new enum. Access0 is named to ACCESS_COORDINATE_LOCAL in order to indicate that the access class is for 'struct access_coordinate' between a target node and the nearest initiator node. Access1 is named to ACCESS_COORDINATE_CPU in order to indicate that the access class is for 'struct access_coordinate' between a target node and the nearest CPU node. Cc: Greg Kroah-Hartman Cc: Rafael J. Wysocki Reviewed-by: Jonathan Cameron Tested-by: Jonathan Cameron Acked-by: Greg Kroah-Hartman Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/20240308220055.2172956-3-dave.jiang@intel.com Signed-off-by: Dan Williams --- include/linux/node.h | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/node.h b/include/linux/node.h index 25b66d705ee2..dfc004e4bee7 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -34,6 +34,18 @@ struct access_coordinate { unsigned int write_latency; }; +/* + * ACCESS_COORDINATE_LOCAL correlates to ACCESS CLASS 0 + * - access_coordinate between target node and nearest initiator node + * ACCESS_COORDINATE_CPU correlates to ACCESS CLASS 1 + * - access_coordinate between target node and nearest CPU node + */ +enum access_coordinate_class { + ACCESS_COORDINATE_LOCAL, + ACCESS_COORDINATE_CPU, + ACCESS_COORDINATE_MAX +}; + enum cache_indexing { NODE_CACHE_DIRECT_MAP, NODE_CACHE_INDEXED, @@ -66,7 +78,7 @@ struct node_cache_attrs { #ifdef CONFIG_HMEM_REPORTING void node_add_cache(unsigned int nid, struct node_cache_attrs *cache_attrs); void node_set_perf_attrs(unsigned int nid, struct access_coordinate *coord, - unsigned access); + enum access_coordinate_class access); #else static inline void node_add_cache(unsigned int nid, struct node_cache_attrs *cache_attrs) @@ -75,7 +87,7 @@ static inline void node_add_cache(unsigned int nid, static inline void node_set_perf_attrs(unsigned int nid, struct 
access_coordinate *coord, - unsigned access) + enum access_coordinate_class access) { } #endif @@ -137,7 +149,7 @@ extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk); extern int register_memory_node_under_compute_node(unsigned int mem_nid, unsigned int cpu_nid, - unsigned access); + enum access_coordinate_class access); #else static inline void node_dev_init(void) { -- cgit v1.2.3 From 067353a46d8ccdac279ebab97c038c3658e97541 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 8 Mar 2024 14:59:30 -0700 Subject: cxl/region: Add memory hotplug notifier for cxl region When the CXL region is formed, the driver computes the performance data for the region. However this data is not available at the node data collection that has been populated by the HMAT during kernel initialization. Add a memory hotplug notifier to update the access coordinates to the 'struct memory_target' context kept by the HMAT_REPORTING code. Add CXL_CALLBACK_PRI for a memory hotplug callback priority. Set the priority number to be called before HMAT_CALLBACK_PRI. The CXL update must happen before hmat_callback(). A new HMAT_REPORTING helper hmat_update_target_coordinates() is added in order to allow CXL to update the memory_target access coordinates. A new ext_updated member is added to the memory_target to indicate that the access coordinates within the memory_target has been updated by an external agent such as CXL. This prevents data being overwritten by the hmat_update_target_attrs() triggered by hmat_callback(). Cc: Andrew Morton Cc: Rafael J. 
Wysocki Reviewed-by: Huang, Ying Reviewed-by: Jonathan Cameron Tested-by: Jonathan Cameron Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/20240308220055.2172956-12-dave.jiang@intel.com Signed-off-by: Dan Williams --- include/linux/acpi.h | 12 ++++++++++++ include/linux/memory.h | 1 + 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index b7165e52b3c6..c84c2f34b8ee 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1547,4 +1547,16 @@ static inline void acpi_use_parent_companion(struct device *dev) ACPI_COMPANION_SET(dev, ACPI_COMPANION(dev->parent)); } +#ifdef CONFIG_ACPI_HMAT +int hmat_update_target_coordinates(int nid, struct access_coordinate *coord, + enum access_coordinate_class access); +#else +static inline int hmat_update_target_coordinates(int nid, + struct access_coordinate *coord, + enum access_coordinate_class access) +{ + return -EOPNOTSUPP; +} +#endif + #endif /*_LINUX_ACPI_H*/ diff --git a/include/linux/memory.h b/include/linux/memory.h index f53cfdaaaa41..d8588256578a 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -114,6 +114,7 @@ struct mem_section; #define DEFAULT_CALLBACK_PRI 0 #define SLAB_CALLBACK_PRI 1 #define HMAT_CALLBACK_PRI 2 +#define CXL_CALLBACK_PRI 5 #define MM_COMPUTE_BATCH_PRI 10 #define CPUSET_CALLBACK_PRI 10 #define MEMTIER_HOTPLUG_PRI 100 -- cgit v1.2.3 From 75060b6ead0e93b7b43a451ba1e13c49b1aa2025 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 6 Mar 2024 07:49:16 +0100 Subject: watchdog/core: remove sysctl handlers from public header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The functions are only used in the file where they are defined. Remove them from the header and make them static. Also guard proc_soft_watchdog with a #define-guard as it is not used otherwise. 
Link: https://lkml.kernel.org/r/20240306-const-sysctl-prep-watchdog-v1-1-bd45da3a41cf@weissschuh.net Signed-off-by: Thomas Weißschuh Signed-off-by: Andrew Morton --- include/linux/nmi.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nmi.h b/include/linux/nmi.h index e92e378df000..f53438eae815 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -216,13 +216,6 @@ void watchdog_update_hrtimer_threshold(u64 period); static inline void watchdog_update_hrtimer_threshold(u64 period) { } #endif -struct ctl_table; -int proc_watchdog(struct ctl_table *, int, void *, size_t *, loff_t *); -int proc_nmi_watchdog(struct ctl_table *, int , void *, size_t *, loff_t *); -int proc_soft_watchdog(struct ctl_table *, int , void *, size_t *, loff_t *); -int proc_watchdog_thresh(struct ctl_table *, int , void *, size_t *, loff_t *); -int proc_watchdog_cpumask(struct ctl_table *, int, void *, size_t *, loff_t *); - #ifdef CONFIG_HAVE_ACPI_APEI_NMI #include #endif -- cgit v1.2.3 From debdce20c4f28b7e5aa48512e7abf270a00e9051 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 8 Mar 2024 14:59:31 -0700 Subject: cxl/region: Deal with numa nodes not enumerated by SRAT For the numa nodes that are not created by SRAT, no memory_target is allocated and is not managed by the HMAT_REPORTING code. Therefore hmat_callback() memory hotplug notifier will exit early on those NUMA nodes. The CXL memory hotplug notifier will need to call node_set_perf_attrs() directly in order to setup the access sysfs attributes. In acpi_numa_init(), the last proximity domain (pxm) id created by SRAT is stored. Add a helper function acpi_node_backed_by_real_pxm() in order to check if a NUMA node id is defined by SRAT or created by CFMWS. node_set_perf_attrs() symbol is exported to allow update of perf attribs for a node. 
The sysfs path of /sys/devices/system/node/nodeX/access0/initiators/* is created by node_set_perf_attrs() for the various attributes where nodeX is matched to the NUMA node of the CXL region. Cc: Rafael J. Wysocki Reviewed-by: Alison Schofield Reviewed-by: Jonathan Cameron Tested-by: Jonathan Cameron Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/20240308220055.2172956-13-dave.jiang@intel.com Signed-off-by: Dan Williams --- include/linux/acpi.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index c84c2f34b8ee..2a7c4b90d589 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1559,4 +1559,13 @@ static inline int hmat_update_target_coordinates(int nid, } #endif +#ifdef CONFIG_ACPI_NUMA +bool acpi_node_backed_by_real_pxm(int nid); +#else +static inline bool acpi_node_backed_by_real_pxm(int nid) +{ + return false; +} +#endif + #endif /*_LINUX_ACPI_H*/ -- cgit v1.2.3 From 9f0c4a46be1fe9b97dbe66d49204c1371e3ece65 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 8 Mar 2024 09:08:34 +0800 Subject: f2fs: fix to truncate meta inode pages forcely Below race case can cause data corruption: Thread A GC thread - gc_data_segment - ra_data_block - locked meta_inode page - f2fs_inplace_write_data - invalidate_mapping_pages : fail to invalidate meta_inode page due to lock failure or dirty|writeback status - f2fs_submit_page_bio : write last dirty data to old blkaddr - move_data_block - load old data from meta_inode page - f2fs_submit_page_write : write old data to new blkaddr Because invalidate_mapping_pages() will skip invalidating page which has unclear status including locked, dirty, writeback and so on, so we need to use truncate_inode_pages_range() instead of invalidate_mapping_pages() to make sure meta_inode page will be dropped. 
Fixes: 6aa58d8ad20a ("f2fs: readahead encrypted block during GC") Fixes: e3b49ea36802 ("f2fs: invalidate META_MAPPING before IPU/DIO write") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 755e9a41b196..a357287eac1e 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -27,6 +27,7 @@ #define F2FS_BYTES_TO_BLK(bytes) ((bytes) >> F2FS_BLKSIZE_BITS) #define F2FS_BLK_TO_BYTES(blk) ((blk) << F2FS_BLKSIZE_BITS) +#define F2FS_BLK_END_BYTES(blk) (F2FS_BLK_TO_BYTES(blk + 1) - 1) /* 0, 1(node nid), 2(meta nid) are reserved node id */ #define F2FS_RESERVED_NODE_NUM 3 -- cgit v1.2.3 From f88c3fb81c4badb46c2fef7d168ff138043e86bb Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 12 Mar 2024 20:32:19 -0700 Subject: mm, slab: remove last vestiges of SLAB_MEM_SPREAD Yes, yes, I know the slab people were planning on going slow and letting every subsystem fight this thing on their own. But let's just rip off the band-aid and get it over and done with. I don't want to see a number of unnecessary pull requests just to get rid of a flag that no longer has any meaning. This was mainly done with a couple of 'sed' scripts and then some manual cleanup of the end result. 
Link: https://lore.kernel.org/all/CAHk-=wji0u+OOtmAOD-5JV3SXcRJF___k_+8XNKmak0yd5vW1Q@mail.gmail.com/ Signed-off-by: Linus Torvalds --- include/linux/slab.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index f6323763cd61..e53cbfa18325 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -202,9 +202,6 @@ enum _slab_flag_bits { #endif #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ -/* Obsolete unused flag, to be removed */ -#define SLAB_MEM_SPREAD __SLAB_FLAG_UNUSED - /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. * -- cgit v1.2.3 From 12fb28ea6b1cf24bde27c406eb38ee3c108d73f9 Mon Sep 17 00:00:00 2001 From: Ben Cheatham Date: Mon, 11 Mar 2024 09:25:06 -0500 Subject: EINJ: Add CXL error type support Move CXL protocol error types from einj.c (now einj-core.c) to einj-cxl.c. einj-cxl.c implements the necessary handling for CXL protocol error injection and exposes an API for the CXL core to use said functionality, while also allowing the EINJ module to be built without CXL support. Because CXL error types targeting CXL 1.0/1.1 ports require special handling, only allow them to be injected through the new cxl debugfs interface (next commit) and return an error when attempting to inject through the legacy interface. Reviewed-by: Jonathan Cameron Signed-off-by: Ben Cheatham Link: https://lore.kernel.org/r/20240311142508.31717-3-Benjamin.Cheatham@amd.com Signed-off-by: Dan Williams --- include/linux/einj-cxl.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 include/linux/einj-cxl.h (limited to 'include/linux') diff --git a/include/linux/einj-cxl.h b/include/linux/einj-cxl.h new file mode 100644 index 000000000000..624ff6ff41f9 --- /dev/null +++ b/include/linux/einj-cxl.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * CXL protocol Error INJection support. 
+ * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Author: Ben Cheatham + */ +#ifndef EINJ_CXL_H +#define EINJ_CXL_H + +#include +#include + +struct pci_dev; +struct seq_file; + +#if IS_ENABLED(CONFIG_ACPI_APEI_EINJ_CXL) +int einj_cxl_available_error_type_show(struct seq_file *m, void *v); +int einj_cxl_inject_error(struct pci_dev *dport_dev, u64 type); +int einj_cxl_inject_rch_error(u64 rcrb, u64 type); +bool einj_cxl_is_initialized(void); +#else /* !IS_ENABLED(CONFIG_ACPI_APEI_EINJ_CXL) */ +static inline int einj_cxl_available_error_type_show(struct seq_file *m, + void *v) +{ + return -ENXIO; +} + +static inline int einj_cxl_inject_error(struct pci_dev *dport_dev, u64 type) +{ + return -ENXIO; +} + +static inline int einj_cxl_inject_rch_error(u64 rcrb, u64 type) +{ + return -ENXIO; +} + +static inline bool einj_cxl_is_initialized(void) { return false; } +#endif /* CONFIG_ACPI_APEI_EINJ_CXL */ + +#endif /* EINJ_CXL_H */ -- cgit v1.2.3 From c6c3187d66bc4e87086036266def4170742d7214 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Sat, 17 Feb 2024 22:39:46 +0100 Subject: lib/firmware_table: Provide buffer length argument to cdat_table_parse() There exist card implementations with a CDAT table using a fixed size buffer, but with entries filled in that do not fill the whole table length size. Then, the last entry in the CDAT table may not mark the end of the CDAT table buffer specified by the length field in the CDAT header. It can be shorter with trailing unused (zero'ed) data. The actual table length is determined while reading all CDAT entries of the table with DOE. 
If the table is greater than expected (containing zero'ed trailing data), the CDAT parser fails with: [ 48.691717] Malformed DSMAS table length: (24:0) [ 48.702084] [CDAT:0x00] Invalid zero length [ 48.711460] cxl_port endpoint1: Failed to parse CDAT: -22 In addition, a check of the table buffer length is missing to prevent an out-of-bound access when parsing the CDAT table. Hardening code against device returning borked table. Fix that by providing an optional buffer length argument to acpi_parse_entries_array() that can be used by cdat_table_parse() to propagate the buffer size down to its users to check the buffer length. This also prevents a possible out-of-bound access mentioned. Add a check to warn about a malformed CDAT table length. Cc: Rafael J. Wysocki Cc: Len Brown Reviewed-by: Dave Jiang Signed-off-by: Robert Richter Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/ZdEnopFO0Tl3t2O1@rric.localdomain Signed-off-by: Dan Williams --- include/linux/fw_table.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fw_table.h b/include/linux/fw_table.h index 95421860397a..3ff4c277296f 100644 --- a/include/linux/fw_table.h +++ b/include/linux/fw_table.h @@ -40,12 +40,14 @@ union acpi_subtable_headers { int acpi_parse_entries_array(char *id, unsigned long table_size, union fw_table_header *table_header, + unsigned long max_length, struct acpi_subtable_proc *proc, int proc_num, unsigned int max_entries); int cdat_table_parse(enum acpi_cdat_type type, acpi_tbl_entry_handler_arg handler_arg, void *arg, - struct acpi_table_cdat *table_header); + struct acpi_table_cdat *table_header, + unsigned long length); /* CXL is the only non-ACPI consumer of the FIRMWARE_TABLE library */ #if IS_ENABLED(CONFIG_ACPI) && !IS_ENABLED(CONFIG_CXL_BUS) -- cgit v1.2.3 From 9d9539db8638cfe053fcd1f441746f0e2c8c2d32 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 12 Mar 2024 10:39:44 +0100 Subject: pidfs: 
remove config option As Linus suggested this enables pidfs unconditionally. A key property to retain is the ability to compare pidfds by inode number (cf. [1]). That's extremely helpful just as comparing namespace file descriptors by inode number is. They are used in a variety of scenarios where they need to be compared, e.g., when receiving a pidfd via SO_PEERPIDFD from a socket to trivially authenticate the sender and various other use-cases. For 64bit systems this is pretty trivial to do. For 32bit it's slightly more annoying as we discussed but we simply add a dumb ida based allocator that gets used on 32bit. This gives the same guarantees about inode numbers on 64bit without any overflow risk. Practically, we'll never run into overflow issues because we're constrained by the number of processes that can exist on 32bit and by the number of open files that can exist on a 32bit system. On 64bit none of this matters and things are very simple. If 32bit also needs the uniqueness guarantee they can simply parse the contents of /proc/<pid>/fd/<fd>. The uniqueness guarantees have a variety of use-cases. One of the most obvious ones is that they will make pidfiles (or "pidfdfiles", I guess) reliable as the unique identifier can be placed into there that won't be recycled. Also a frequent request. Note, I took the chance and simplified path_from_stashed() even further. Instead of passing the inode number explicitly to path_from_stashed() we let the filesystem handle that internally. So path_from_stashed() ends up even simpler than it is now. This is also a good solution allowing the cleanup code to be clean and consistent between 32bit and 64bit. The cleanup path in prepare_anon_dentry() is also switched around so we put the inode before the dentry allocation. This means we only have to call the cleanup handler for the filesystem's inode data once and can rely on ->evict_inode() otherwise. 
Aside from having to have a bit of extra code for 32bit it actually ends up a nice cleanup for path_from_stashed() imho. Tested on both 32 and 64bit including error injection. Link: https://github.com/systemd/systemd/pull/31713 [1] Link: https://lore.kernel.org/r/20240312-dingo-sehnlich-b3ecc35c6de7@brauner Signed-off-by: Christian Brauner Signed-off-by: Linus Torvalds --- include/linux/pid.h | 6 +++--- include/linux/pidfs.h | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pid.h b/include/linux/pid.h index c79a0efd0258..a3aad9b4074c 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -45,6 +45,8 @@ * find_pid_ns() using the int nr and struct pid_namespace *ns. */ +#define RESERVED_PIDS 300 + struct upid { int nr; struct pid_namespace *ns; @@ -55,10 +57,8 @@ struct pid refcount_t count; unsigned int level; spinlock_t lock; -#ifdef CONFIG_FS_PID struct dentry *stashed; - unsigned long ino; -#endif + u64 ino; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; struct hlist_head inodes; diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 40dd325a32a6..75bdf9807802 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -4,6 +4,5 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); void __init pidfs_init(void); -bool is_pidfs_sb(const struct super_block *sb); #endif /* _LINUX_PID_FS_H */ -- cgit v1.2.3 From 0225bdfafd818f895fa4a4512f124a1614e011e2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 1 Feb 2024 06:28:41 -0500 Subject: mempool: kvmalloc pool Add mempool_init_kvmalloc_pool() and mempool_create_kvmalloc_pool(), which wrap kvmalloc() instead of kmalloc() - kmalloc() with a vmalloc() fallback. This is part of a bcachefs cleanup - dropping an internal kvpmalloc() helper (which predates kvmalloc()) along with mempool helpers; this replaces the bcachefs-private kvpmalloc_pool. 
Signed-off-by: Kent Overstreet Cc: linux-mm@kvack.org --- include/linux/mempool.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 7be1e32e6d42..16c5cc807ff6 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -95,6 +95,19 @@ static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size) (void *) size); } +void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data); +void mempool_kvfree(void *element, void *pool_data); + +static inline int mempool_init_kvmalloc_pool(mempool_t *pool, int min_nr, size_t size) +{ + return mempool_init(pool, min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size); +} + +static inline mempool_t *mempool_create_kvmalloc_pool(int min_nr, size_t size) +{ + return mempool_create(min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size); +} + /* * A mempool_alloc_t and mempool_free_t for a simple page allocator that * allocates pages of the order specified by pool_data -- cgit v1.2.3 From 9448e55d032d99af8e23487f51a542d51b2f1a48 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Sun, 25 Feb 2024 14:27:11 +0000 Subject: of: Add cleanup.h based auto release via __free(device_node) markings The recent addition of scope based cleanup support to the kernel provides a convenient tool to reduce the chances of leaking reference counts where of_node_put() should have been called in an error path. This enables struct device_node *child __free(device_node) = NULL; for_each_child_of_node(np, child) { if (test) return test; } with no need for a manual call of of_node_put(). A following patch will reduce the scope of the child variable to the for loop, to avoid an issues with ordering of autocleanup, and make it obvious when this assigned a non NULL value. 
In this simple example the gains are small but there are some very complex error handling cases buried in these loops that will be greatly simplified by enabling early returns without the need for this manual of_node_put() call. Note that there are coccinelle checks in scripts/coccinelle/iterators/for_each_child.cocci to detect a failure to call of_node_put(). This new approach does not cause false positives. Longer term we may want to add scripting to check this new approach is done correctly with no double of_node_put() calls being introduced due to the auto cleanup. It may also be useful to script finding places this new approach is useful. Signed-off-by: Jonathan Cameron Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20240225142714.286440-2-jic23@kernel.org Signed-off-by: Rob Herring --- include/linux/of.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index d5e7acdc8c8e..2992e24cd72b 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -13,6 +13,7 @@ */ #include #include +#include #include #include #include @@ -134,6 +135,7 @@ static inline struct device_node *of_node_get(struct device_node *node) } static inline void of_node_put(struct device_node *node) { } #endif /* !CONFIG_OF_DYNAMIC */ +DEFINE_FREE(device_node, struct device_node *, if (_T) of_node_put(_T)) /* Pointer for first entry in chain of all nodes. */ extern struct device_node *of_root; -- cgit v1.2.3 From 34af4554fb0ce164e2c4876683619eb1e23848d4 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Sun, 25 Feb 2024 14:27:12 +0000 Subject: of: Introduce for_each_*_child_of_node_scoped() to automate of_node_put() handling To avoid issues with out of order cleanup, or ambiguity about when the auto freed data is first instantiated, do it within the for loop definition. The disadvantage is that the struct device_node *child variable creation is not immediately obvious where this is used. 
However, in many cases, if there is another definition of struct device_node *child; the compiler / static analysers will notify us that it is unused, or uninitialized. Note that, in the vast majority of cases, the _available_ form should be used and as code is converted to these scoped handlers, we should confirm that any cases that do not check for available have a good reason not to. Signed-off-by: Jonathan Cameron Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20240225142714.286440-3-jic23@kernel.org Signed-off-by: Rob Herring --- include/linux/of.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 2992e24cd72b..4677e50d52b7 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -1420,10 +1420,23 @@ static inline int of_property_read_s32(const struct device_node *np, #define for_each_child_of_node(parent, child) \ for (child = of_get_next_child(parent, NULL); child != NULL; \ child = of_get_next_child(parent, child)) + +#define for_each_child_of_node_scoped(parent, child) \ + for (struct device_node *child __free(device_node) = \ + of_get_next_child(parent, NULL); \ + child != NULL; \ + child = of_get_next_child(parent, child)) + #define for_each_available_child_of_node(parent, child) \ for (child = of_get_next_available_child(parent, NULL); child != NULL; \ child = of_get_next_available_child(parent, child)) +#define for_each_available_child_of_node_scoped(parent, child) \ + for (struct device_node *child __free(device_node) = \ + of_get_next_available_child(parent, NULL); \ + child != NULL; \ + child = of_get_next_available_child(parent, child)) + #define for_each_of_cpu_node(cpu) \ for (cpu = of_get_next_cpu_node(NULL); cpu != NULL; \ cpu = of_get_next_cpu_node(cpu)) -- cgit v1.2.3 From 66a67c860cce3643248f7e80ee095b946829a342 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 28 Feb 2024 18:28:48 -0500 Subject: fs: file_remove_privs_flags() Rename 
and export __file_remove_privs(); for a buffered write path that doesn't take the inode lock we need to be able to check if the operation needs to do work first. Signed-off-by: Kent Overstreet Cc: Alexander Viro Cc: Christian Brauner --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 1fbc72c5f112..14ea66b62823 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3004,6 +3004,7 @@ extern struct inode *new_inode_pseudo(struct super_block *sb); extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *); +extern int file_remove_privs_flags(struct file *file, unsigned int flags); extern int file_remove_privs(struct file *); int setattr_should_drop_sgid(struct mnt_idmap *idmap, const struct inode *inode); -- cgit v1.2.3 From 3a319a2476d27e0b6c3cac3ebf6e3d0b665a06e5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Mar 2024 22:32:06 -0500 Subject: lib/generic-radix-tree.c: Make nodes more reasonably sized this code originally used the page allocator directly, but most code shouldn't do that - PAGE_SIZE varies with architecture, and slab is faster. 4k is also on the large side for typical usage, 512 bytes is a better choice for typical usage that might be somewhat sparse. Signed-off-by: Kent Overstreet --- include/linux/generic-radix-tree.h | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h index 847413164738..f3512fddf3d7 100644 --- a/include/linux/generic-radix-tree.h +++ b/include/linux/generic-radix-tree.h @@ -5,7 +5,7 @@ * DOC: Generic radix trees/sparse arrays * * Very simple and minimalistic, supporting arbitrary size entries up to - * PAGE_SIZE. + * GENRADIX_NODE_SIZE. 
* * A genradix is defined with the type it will store, like so: * @@ -45,12 +45,15 @@ struct genradix_root; +#define GENRADIX_NODE_SHIFT 9 +#define GENRADIX_NODE_SIZE (1U << GENRADIX_NODE_SHIFT) + struct __genradix { struct genradix_root *root; }; /* - * NOTE: currently, sizeof(_type) must not be larger than PAGE_SIZE: + * NOTE: currently, sizeof(_type) must not be larger than GENRADIX_NODE_SIZE: */ #define __GENRADIX_INITIALIZER \ @@ -101,14 +104,14 @@ void __genradix_free(struct __genradix *); static inline size_t __idx_to_offset(size_t idx, size_t obj_size) { if (__builtin_constant_p(obj_size)) - BUILD_BUG_ON(obj_size > PAGE_SIZE); + BUILD_BUG_ON(obj_size > GENRADIX_NODE_SIZE); else - BUG_ON(obj_size > PAGE_SIZE); + BUG_ON(obj_size > GENRADIX_NODE_SIZE); if (!is_power_of_2(obj_size)) { - size_t objs_per_page = PAGE_SIZE / obj_size; + size_t objs_per_page = GENRADIX_NODE_SIZE / obj_size; - return (idx / objs_per_page) * PAGE_SIZE + + return (idx / objs_per_page) * GENRADIX_NODE_SIZE + (idx % objs_per_page) * obj_size; } else { return idx * obj_size; @@ -118,9 +121,9 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size) #define __genradix_cast(_radix) (typeof((_radix)->type[0]) *) #define __genradix_obj_size(_radix) sizeof((_radix)->type[0]) #define __genradix_objs_per_page(_radix) \ - (PAGE_SIZE / sizeof((_radix)->type[0])) + (GENRADIX_NODE_SIZE / sizeof((_radix)->type[0])) #define __genradix_page_remainder(_radix) \ - (PAGE_SIZE % sizeof((_radix)->type[0])) + (GENRADIX_NODE_SIZE % sizeof((_radix)->type[0])) #define __genradix_idx_to_offset(_radix, _idx) \ __idx_to_offset(_idx, __genradix_obj_size(_radix)) @@ -217,8 +220,8 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter, iter->offset += obj_size; if (!is_power_of_2(obj_size) && - (iter->offset & (PAGE_SIZE - 1)) + obj_size > PAGE_SIZE) - iter->offset = round_up(iter->offset, PAGE_SIZE); + (iter->offset & (GENRADIX_NODE_SIZE - 1)) + obj_size > GENRADIX_NODE_SIZE) + 
iter->offset = round_up(iter->offset, GENRADIX_NODE_SIZE); iter->pos++; } @@ -235,8 +238,8 @@ static inline void __genradix_iter_rewind(struct genradix_iter *iter, return; } - if ((iter->offset & (PAGE_SIZE - 1)) == 0) - iter->offset -= PAGE_SIZE % obj_size; + if ((iter->offset & (GENRADIX_NODE_SIZE - 1)) == 0) + iter->offset -= GENRADIX_NODE_SIZE % obj_size; iter->offset -= obj_size; iter->pos--; @@ -263,7 +266,7 @@ static inline void __genradix_iter_rewind(struct genradix_iter *iter, genradix_for_each_from(_radix, _iter, _p, 0) #define genradix_last_pos(_radix) \ - (SIZE_MAX / PAGE_SIZE * __genradix_objs_per_page(_radix) - 1) + (SIZE_MAX / GENRADIX_NODE_SIZE * __genradix_objs_per_page(_radix) - 1) /** * genradix_for_each_reverse - iterate over entry in a genradix, reverse order -- cgit v1.2.3 From a5a858f622a0aff5cdb5e271442cd01b2a01467f Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Thu, 14 Mar 2024 11:31:26 -0400 Subject: lsm: use 32-bit compatible data types in LSM syscalls Change the size parameters in lsm_list_modules(), lsm_set_self_attr() and lsm_get_self_attr() from size_t to u32. This avoids the need to have different interfaces for 32 and 64 bit systems. Cc: stable@vger.kernel.org Fixes: a04a1198088a ("LSM: syscalls for current process attributes") Fixes: ad4aff9ec25f ("LSM: Create lsm_list_modules system call") Signed-off-by: Casey Schaufler Reported-and-reviewed-by: Dmitry V. 
Levin [PM: subject and metadata tweaks, syscall.h fixes] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 4 ++-- include/linux/security.h | 8 ++++---- include/linux/syscalls.h | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index a8057a3f8de6..334e00efbde4 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -280,9 +280,9 @@ LSM_HOOK(int, 0, netlink_send, struct sock *sk, struct sk_buff *skb) LSM_HOOK(void, LSM_RET_VOID, d_instantiate, struct dentry *dentry, struct inode *inode) LSM_HOOK(int, -EOPNOTSUPP, getselfattr, unsigned int attr, - struct lsm_ctx __user *ctx, size_t *size, u32 flags) + struct lsm_ctx __user *ctx, u32 *size, u32 flags) LSM_HOOK(int, -EOPNOTSUPP, setselfattr, unsigned int attr, - struct lsm_ctx *ctx, size_t size, u32 flags) + struct lsm_ctx *ctx, u32 size, u32 flags) LSM_HOOK(int, -EINVAL, getprocattr, struct task_struct *p, const char *name, char **value) LSM_HOOK(int, -EINVAL, setprocattr, const char *name, void *value, size_t size) diff --git a/include/linux/security.h b/include/linux/security.h index f249f5b9a9d7..41a8f667bdfa 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -491,9 +491,9 @@ int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops, unsigned nsops, int alter); void security_d_instantiate(struct dentry *dentry, struct inode *inode); int security_getselfattr(unsigned int attr, struct lsm_ctx __user *ctx, - size_t __user *size, u32 flags); + u32 __user *size, u32 flags); int security_setselfattr(unsigned int attr, struct lsm_ctx __user *ctx, - size_t size, u32 flags); + u32 size, u32 flags); int security_getprocattr(struct task_struct *p, int lsmid, const char *name, char **value); int security_setprocattr(int lsmid, const char *name, void *value, size_t size); @@ -507,7 +507,7 @@ int security_inode_notifysecctx(struct inode *inode, 
void *ctx, u32 ctxlen); int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen); int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen); int security_locked_down(enum lockdown_reason what); -int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, size_t *uctx_len, +int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, u32 *uctx_len, void *val, size_t val_len, u64 id, u64 flags); #else /* CONFIG_SECURITY */ @@ -1478,7 +1478,7 @@ static inline int security_locked_down(enum lockdown_reason what) return 0; } static inline int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, - size_t *uctx_len, void *val, size_t val_len, + u32 *uctx_len, void *val, size_t val_len, u64 id, u64 flags) { return -EOPNOTSUPP; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 77eb9b0e7685..e619ac10cd23 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -960,10 +960,10 @@ asmlinkage long sys_cachestat(unsigned int fd, struct cachestat __user *cstat, unsigned int flags); asmlinkage long sys_map_shadow_stack(unsigned long addr, unsigned long size, unsigned int flags); asmlinkage long sys_lsm_get_self_attr(unsigned int attr, struct lsm_ctx *ctx, - size_t *size, __u32 flags); + u32 *size, u32 flags); asmlinkage long sys_lsm_set_self_attr(unsigned int attr, struct lsm_ctx *ctx, - size_t size, __u32 flags); -asmlinkage long sys_lsm_list_modules(u64 *ids, size_t *size, u32 flags); + u32 size, u32 flags); +asmlinkage long sys_lsm_list_modules(u64 *ids, u32 *size, u32 flags); /* * Architecture-specific system calls -- cgit v1.2.3 From e54e09c05c00120cbe817bdb037088035be4bd79 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Mar 2024 09:55:45 -0600 Subject: net: remove {revc,send}msg_copy_msghdr() from exports The only user of these was io_uring, and it's not using them anymore. Make them static and remove them from the socket header file. 
Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/1b6089d3-c1cf-464a-abd3-b0f0b6bb2523@kernel.dk Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index cfcb7e2c3813..139c330ccf2c 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -422,13 +422,6 @@ extern long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg, struct user_msghdr __user *umsg, struct sockaddr __user *uaddr, unsigned int flags); -extern int sendmsg_copy_msghdr(struct msghdr *msg, - struct user_msghdr __user *umsg, unsigned flags, - struct iovec **iov); -extern int recvmsg_copy_msghdr(struct msghdr *msg, - struct user_msghdr __user *umsg, unsigned flags, - struct sockaddr __user **uaddr, - struct iovec **iov); extern int __copy_msghdr(struct msghdr *kmsg, struct user_msghdr *umsg, struct sockaddr __user **save_addr); -- cgit v1.2.3 From 152609795dbf02f004c86049b75c23f4e68071d8 Mon Sep 17 00:00:00 2001 From: Samuel Thibault Date: Sat, 16 Mar 2024 01:10:21 +0100 Subject: fbcon: Increase maximum font width x height to 64 x 128 By using bitmaps we actually support whatever size we would want, but the console currently limits fonts to 64x128 (which gives 60x16 text on 4k screens), so we don't need more for now, and we can easily increase later. 
Signed-off-by: Samuel Thibault Signed-off-by: Helge Deller --- include/linux/fb.h | 18 ++++++++++++------ include/linux/font.h | 3 ++- 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index 05dc9624897d..7d7c7791fd26 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -143,9 +143,13 @@ struct fb_event { void *data; }; +/* Enough for the VT console needs, see its max_font_width/height */ +#define FB_MAX_BLIT_WIDTH 64 +#define FB_MAX_BLIT_HEIGHT 128 + struct fb_blit_caps { - u32 x; - u32 y; + DECLARE_BITMAP(x, FB_MAX_BLIT_WIDTH); + DECLARE_BITMAP(y, FB_MAX_BLIT_HEIGHT); u32 len; u32 flags; }; @@ -192,10 +196,12 @@ struct fb_pixmap { u32 scan_align; /* alignment per scanline */ u32 access_align; /* alignment per read/write (bits) */ u32 flags; /* see FB_PIXMAP_* */ - u32 blit_x; /* supported bit block dimensions (1-32)*/ - u32 blit_y; /* Format: blit_x = 1 << (width - 1) */ - /* blit_y = 1 << (height - 1) */ - /* if 0, will be set to 0xffffffff (all)*/ + /* supported bit block dimensions */ + /* Format: test_bit(width - 1, blit_x) */ + /* test_bit(height - 1, blit_y) */ + /* if zero, will be set to full (all) */ + DECLARE_BITMAP(blit_x, FB_MAX_BLIT_WIDTH); + DECLARE_BITMAP(blit_y, FB_MAX_BLIT_HEIGHT); /* access methods */ void (*writeio)(struct fb_info *info, void __iomem *dst, void *src, unsigned int size); void (*readio) (struct fb_info *info, void *dst, void __iomem *src, unsigned int size); diff --git a/include/linux/font.h b/include/linux/font.h index abf1442ce719..81caffd51bb4 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -57,7 +57,8 @@ extern const struct font_desc *find_font(const char *name); /* Get the default font for a specific screen size */ extern const struct font_desc *get_default_font(int xres, int yres, - u32 font_w, u32 font_h); + unsigned long *font_w, + unsigned long *font_h); /* Max. 
length for the name of a predefined font */ #define MAX_FONT_NAME 32 -- cgit v1.2.3 From 56a34d799bfa53064e7b8bd354aacd176aeaecc8 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Mon, 26 Feb 2024 16:00:08 +0530 Subject: kexec/kdump: make struct crash_mem available without CONFIG_CRASH_DUMP struct crash_mem defined under include/linux/crash_core.h represents a list of memory ranges. While it is used to represent memory ranges for the kdump kernel, it can also be used for other kinds of memory ranges. In fact, the KEXEC_FILE_LOAD syscall on powerpc uses this structure to represent reserved memory ranges and exclude memory ranges needed to find the right memory regions to load the kexec kernel. So, make the definition of the crash_mem structure available for the !CONFIG_CRASH_DUMP case too. Signed-off-by: Hari Bathini Acked-by: Baoquan He Signed-off-by: Michael Ellerman Link: https://msgid.link/20240226103010.589537-2-hbathini@linux.ibm.com --- include/linux/crash_core.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 23270b16e1db..d33352c2e386 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -8,6 +8,12 @@ struct kimage; +struct crash_mem { + unsigned int max_nr_ranges; + unsigned int nr_ranges; + struct range ranges[] __counted_by(max_nr_ranges); +}; + #ifdef CONFIG_CRASH_DUMP int crash_shrink_memory(unsigned long new_size); @@ -51,12 +57,6 @@ static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; } /* Alignment required for elf header segment */ #define ELF_CORE_HEADER_ALIGN 4096 -struct crash_mem { - unsigned int max_nr_ranges; - unsigned int nr_ranges; - struct range ranges[] __counted_by(max_nr_ranges); -}; - extern int crash_exclude_mem_range(struct crash_mem *mem, unsigned long long mstart, unsigned long long mend); -- cgit v1.2.3 From 59a55a63c24624c7ad268f12c8f82d142ef6a6d4 Mon Sep 17 00:00:00 2001 From: Christian
Brauner Date: Thu, 14 Mar 2024 15:24:13 +0100 Subject: fs,block: get holder during claim Now that we open block devices as files we need to deal with the realities that closing is a deferred operation. An operation on the block device such as e.g., freeze, thaw, or removal that runs concurrently with umount, tries to acquire a stable reference on the holder. The holder might already be gone though. Make that reliable by grabbing a passive reference to the holder during bdev_open() and releasing it during bdev_release(). Fixes: f3a608827d1f ("bdev: open block device as files") # mainline only Reported-by: Christoph Hellwig Link: https://lore.kernel.org/r/ZfEQQ9jZZVes0WCZ@infradead.org Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Tested-by: Yi Zhang Reported-by: https://lore.kernel.org/r/CAHj4cs8tbDwKRwfS1=DmooP73ysM__xAb2PQc6XsAmWR+VuYmg@mail.gmail.com Link: https://lore.kernel.org/r/20240315-freibad-annehmbar-ca68c375af91@brauner Signed-off-by: Christian Brauner --- include/linux/blkdev.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f9b87c39cab0..c3e8f7cf96be 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1505,6 +1505,16 @@ struct blk_holder_ops { * Thaw the file system mounted on the block device. */ int (*thaw)(struct block_device *bdev); + + /* + * If needed, get a reference to the holder. + */ + void (*get_holder)(void *holder); + + /* + * Release the holder. + */ + void (*put_holder)(void *holder); }; /* -- cgit v1.2.3 From c3198822c6cb9fb588e446540485669cc81c5d34 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Fri, 8 Mar 2024 17:26:00 +0200 Subject: net: esp: fix bad handling of pages from page_pool When the skb is reorganized during esp_output (!esp->inline), the pages coming from the original skb fragments are supposed to be released back to the system through put_page. 
But if the skb fragment pages are originating from a page_pool, calling put_page on them will trigger a page_pool leak which will eventually result in a crash. This leak can be easily observed when using CONFIG_DEBUG_VM and doing ipsec + gre (non offloaded) forwarding: BUG: Bad page state in process ksoftirqd/16 pfn:1451b6 page:00000000de2b8d32 refcount:0 mapcount:0 mapping:0000000000000000 index:0x1451b6000 pfn:0x1451b6 flags: 0x200000000000000(node=0|zone=2) page_type: 0xffffffff() raw: 0200000000000000 dead000000000040 ffff88810d23c000 0000000000000000 raw: 00000001451b6000 0000000000000001 00000000ffffffff 0000000000000000 page dumped because: page_pool leak Modules linked in: ip_gre gre mlx5_ib mlx5_core xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat nf_nat xt_addrtype br_netfilter rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm ib_uverbs ib_core overlay zram zsmalloc fuse [last unloaded: mlx5_core] CPU: 16 PID: 96 Comm: ksoftirqd/16 Not tainted 6.8.0-rc4+ #22 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x36/0x50 bad_page+0x70/0xf0 free_unref_page_prepare+0x27a/0x460 free_unref_page+0x38/0x120 esp_ssg_unref.isra.0+0x15f/0x200 esp_output_tail+0x66d/0x780 esp_xmit+0x2c5/0x360 validate_xmit_xfrm+0x313/0x370 ? validate_xmit_skb+0x1d/0x330 validate_xmit_skb_list+0x4c/0x70 sch_direct_xmit+0x23e/0x350 __dev_queue_xmit+0x337/0xba0 ? nf_hook_slow+0x3f/0xd0 ip_finish_output2+0x25e/0x580 iptunnel_xmit+0x19b/0x240 ip_tunnel_xmit+0x5fb/0xb60 ipgre_xmit+0x14d/0x280 [ip_gre] dev_hard_start_xmit+0xc3/0x1c0 __dev_queue_xmit+0x208/0xba0 ? nf_hook_slow+0x3f/0xd0 ip_finish_output2+0x1ca/0x580 ip_sublist_rcv_finish+0x32/0x40 ip_sublist_rcv+0x1b2/0x1f0 ? 
ip_rcv_finish_core.constprop.0+0x460/0x460 ip_list_rcv+0x103/0x130 __netif_receive_skb_list_core+0x181/0x1e0 netif_receive_skb_list_internal+0x1b3/0x2c0 napi_gro_receive+0xc8/0x200 gro_cell_poll+0x52/0x90 __napi_poll+0x25/0x1a0 net_rx_action+0x28e/0x300 __do_softirq+0xc3/0x276 ? sort_range+0x20/0x20 run_ksoftirqd+0x1e/0x30 smpboot_thread_fn+0xa6/0x130 kthread+0xcd/0x100 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x31/0x50 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork_asm+0x11/0x20 The suggested fix is to introduce a new wrapper (skb_page_unref) that covers page refcounting for page_pool pages as well. Cc: stable@vger.kernel.org Fixes: 6a5bcd84e886 ("page_pool: Allow drivers to hint on SKB recycling") Reported-and-tested-by: Anatoli N.Chechelnickiy Reported-by: Ian Kumlien Link: https://lore.kernel.org/netdev/CAA85sZvvHtrpTQRqdaOx6gd55zPAVsqMYk_Lwh4Md5knTq7AyA@mail.gmail.com Signed-off-by: Dragos Tatulea Reviewed-by: Mina Almasry Reviewed-by: Jakub Kicinski Acked-by: Ilias Apalodimas Signed-off-by: Steffen Klassert --- include/linux/skbuff.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3023bc2be6a1..b49a7d6591e8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3523,6 +3523,16 @@ int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, struct bpf_prog *prog); bool napi_pp_put_page(struct page *page, bool napi_safe); +static inline void +skb_page_unref(const struct sk_buff *skb, struct page *page, bool napi_safe) +{ +#ifdef CONFIG_PAGE_POOL + if (skb->pp_recycle && napi_pp_put_page(page, napi_safe)) + return; +#endif + put_page(page); +} + static inline void napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe) { -- cgit v1.2.3 From 01c0cce88c5480cc2505b79330246ef12eda938f Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Wed, 28 Feb 2024 08:35:32 +0200 Subject: drm/omapdrm: Fix console with deferred ops 
Commit 95da53d63dcf ("drm/omapdrm: Use regular fbdev I/O helpers") stopped console from updating for command mode displays because there is no damage handling in fb_sys_write() unlike we had earlier in drm_fb_helper_sys_write(). Let's fix the issue by adding FB_GEN_DEFAULT_DEFERRED_DMAMEM_OPS and FB_DMAMEM_HELPERS_DEFERRED as suggested by Thomas. We cannot use the FB_DEFAULT_DEFERRED_OPS as fb_deferred_io_mmap() won't work properly for write-combine. Fixes: 95da53d63dcf ("drm/omapdrm: Use regular fbdev I/O helpers") Suggested-by: Thomas Zimmermann Reviewed-by: Thomas Zimmermann Signed-off-by: Tony Lindgren Signed-off-by: Tomi Valkeinen Link: https://patchwork.freedesktop.org/patch/msgid/20240228063540.4444-3-tony@atomide.com --- include/linux/fb.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index 708e6a177b1b..5e210bf72fc9 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -688,6 +688,10 @@ extern int fb_deferred_io_fsync(struct file *file, loff_t start, __FB_GEN_DEFAULT_DEFERRED_OPS_RDWR(__prefix, __damage_range, sys) \ __FB_GEN_DEFAULT_DEFERRED_OPS_DRAW(__prefix, __damage_area, sys) +#define FB_GEN_DEFAULT_DEFERRED_DMAMEM_OPS(__prefix, __damage_range, __damage_area) \ + __FB_GEN_DEFAULT_DEFERRED_OPS_RDWR(__prefix, __damage_range, sys) \ + __FB_GEN_DEFAULT_DEFERRED_OPS_DRAW(__prefix, __damage_area, sys) + /* * Initializes struct fb_ops for deferred I/O. */ -- cgit v1.2.3 From 35c3e27917568192927c785fc380f139255468b4 Mon Sep 17 00:00:00 2001 From: Abhishek Chauhan Date: Thu, 14 Mar 2024 12:24:04 -0700 Subject: Revert "net: Re-use and set mono_delivery_time bit for userspace tstamp packets" This reverts commit 885c36e59f46375c138de18ff1692f18eff67b7f. 
The patch currently broke the bpf selftest test_tc_dtime because uapi field __sk_buff->tstamp_type depends on skb->mono_delivery_time which does not necessarily mean mono with the original fix as the bit was re-used for userspace timestamp as well to avoid tstamp reset in the forwarding path. To solve this we need to keep mono_delivery_time as is and introduce another bit called user_delivery_time and fall back to the initial proposal of setting the user_delivery_time bit based on sk_clockid set from userspace. Fixes: 885c36e59f46 ("net: Re-use and set mono_delivery_time bit for userspace tstamp packets") Link: https://lore.kernel.org/netdev/bc037db4-58bb-4861-ac31-a361a93841d3@linux.dev/ Signed-off-by: Abhishek Chauhan Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3023bc2be6a1..7d56ce195120 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -822,9 +822,9 @@ typedef unsigned char *sk_buff_data_t; * @decrypted: Decrypted SKB * @slow_gro: state present at GRO time, slower prepare step required * @mono_delivery_time: When set, skb->tstamp has the - * delivery_time in mono clock base (i.e., EDT) or a clock base chosen - * by SO_TXTIME. If zero, skb->tstamp has the (rcv) timestamp at - * ingress. + * delivery_time in mono clock base (i.e. EDT). Otherwise, the + * skb->tstamp has the (rcv) timestamp at ingress and + * delivery_time at egress. * @napi_id: id of the NAPI struct this skb came from * @sender_cpu: (aka @napi_id) source CPU in XPS * @alloc_cpu: CPU which did the skb allocation. 
-- cgit v1.2.3 From 70a6ed553f7d3504febac467cb4a0bae621ba3c6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 22 Feb 2024 16:14:19 -0500 Subject: tracing: Use EVENT_NULL_STR macro instead of open coding "(null)" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The TRACE_EVENT macros have some dependency if a __string() field is NULL, where they will save "(null)" as the string. This string is also used by __assign_str(). It's better to create a single macro instead of having something that will not be caught by the compiler if there is an unfortunate typo. Link: https://lore.kernel.org/linux-trace-kernel/20240222211443.106216915@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrew Morton Cc: Ville Syrjälä Cc: Rodrigo Vivi Cc: Chuck Lever Suggested-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index fc6d0af56bb1..6f9bdfb09d1d 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -17,6 +17,9 @@ struct dentry; struct bpf_prog; union bpf_attr; +/* Used for event string fields when they are NULL */ +#define EVENT_NULL_STR "(null)" + const char *trace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long flags, const struct trace_print_flags *flag_array); -- cgit v1.2.3 From 19f0423fd55c301c8edaea286e568ec657f42750 Mon Sep 17 00:00:00 2001 From: Huang Yiwei Date: Fri, 23 Feb 2024 16:31:26 +0800 Subject: tracing: Support to dump instance traces by ftrace_dump_on_oops Currently ftrace only dumps the global trace buffer on an OOPs. For debugging a production use case, an instance trace will be helpful to check specific problems since the global trace buffer may be used for other purposes.
This patch extends the ftrace_dump_on_oops parameter to dump a specific or multiple trace instances: - ftrace_dump_on_oops=0: as before -- don't dump - ftrace_dump_on_oops[=1]: as before -- dump the global trace buffer on all CPUs - ftrace_dump_on_oops=2 or =orig_cpu: as before -- dump the global trace buffer on the CPU that triggered the oops - ftrace_dump_on_oops=<instance_name>: new behavior -- dump the tracing instance matching <instance_name> - ftrace_dump_on_oops[=2/orig_cpu],<instance1_name>[=2/orig_cpu],<instance2_name>[=2/orig_cpu]: new behavior -- dump the global trace buffer and multiple instance buffers on all CPUs, or only dump on the CPU that triggered the oops if =2 or =orig_cpu is given Also, the sysctl node can handle the input accordingly. Link: https://lore.kernel.org/linux-trace-kernel/20240223083126.1817731-1-quic_hyiwei@quicinc.com Cc: Ross Zwisler Cc: Cc: Cc: Cc: Cc: Cc: Cc: Signed-off-by: Huang Yiwei Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 4 +++- include/linux/kernel.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e8921871ef9a..54d53f345d14 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1151,7 +1151,9 @@ static inline void unpause_graph_tracing(void) { } #ifdef CONFIG_TRACING enum ftrace_dump_mode; -extern enum ftrace_dump_mode ftrace_dump_on_oops; +#define MAX_TRACER_SIZE 100 +extern char ftrace_dump_on_oops[]; +extern int ftrace_dump_on_oops_enabled(void); extern int tracepoint_printk; extern void disable_trace_on_warning(void); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d718fbec72dd..be2e8c0a187e 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -215,6 +215,7 @@ enum ftrace_dump_mode { DUMP_NONE, DUMP_ALL, DUMP_ORIG, + DUMP_PARAM, }; #ifdef CONFIG_TRACING -- cgit v1.2.3 From 1b273124107cc8b9dd52228eba701efa516a3d92 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 28 Feb 2024 13:31:12 -0500 Subject:
tracepoints: Use WARN() and not WARN_ON() for warnings There are two WARN_ON*() warnings in tracepoint.h that deal with RCU usage. But when they trigger, especially from using a TRACE_EVENT() macro, the information is not very helpful and is confusing: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 0 at include/trace/events/lock.h:24 lock_acquire+0x2b2/0x2d0 Where the above warning takes you to: TRACE_EVENT(lock_acquire, <<<--- line 24 in lock.h TP_PROTO(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, struct lockdep_map *next_lock, unsigned long ip), [..] Change the WARN_ON_ONCE() to WARN_ONCE() and add a string that allows someone to search for exactly where the bug happened. Link: https://lore.kernel.org/linux-trace-kernel/20240228133112.0d64fb1b@gandalf.local.home Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Thomas Gleixner Reported-by: Borislav Petkov Tested-by: Borislav Petkov (AMD) Reviewed-by: Paul E. McKenney Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 88c0ba623ee6..689b6d71590e 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -199,7 +199,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) if (!(cond)) \ return; \ \ - if (WARN_ON_ONCE(RCUIDLE_COND(rcuidle))) \ + if (WARN_ONCE(RCUIDLE_COND(rcuidle), \ + "Bad RCU usage for tracepoint")) \ return; \ \ /* keep srcu and sched-rcu usage consistent */ \ @@ -259,7 +260,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) TP_ARGS(args), \ TP_CONDITION(cond), 0); \ if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ - WARN_ON_ONCE(!rcu_is_watching()); \ + WARN_ONCE(!rcu_is_watching(), \ + "RCU not watching for tracepoint"); \ } \ } \ __DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args), \ -- cgit v1.2.3 From 
0a926fc972532788719fd03c4a44724ec23c1875 Mon Sep 17 00:00:00 2001 From: Zhu Lingshan Date: Sat, 3 Feb 2024 00:38:57 +0800 Subject: vDPA: introduce get_vq_size to vdpa_config_ops This commit introduces a new interface get_vq_size to vDPA config ops, this new interface intends to report the size of a specific virtqueue Signed-off-by: Zhu Lingshan Message-Id: <20240202163905.8834-3-lingshan.zhu@intel.com> Signed-off-by: Michael S. Tsirkin --- include/linux/vdpa.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index db15ac07f8a6..4097e8e92860 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -195,6 +195,10 @@ struct vdpa_map_file { * @idx: virtqueue index * Returns int: irq number of a virtqueue, * negative number if no irq assigned. + * @get_vq_size: Get the size of a specific virtqueue (optional) + * @vdev: vdpa device + * @idx: virtqueue index + * Return u16: the size of the virtqueue * @get_vq_align: Get the virtqueue align requirement * for the device * @vdev: vdpa device @@ -386,6 +390,7 @@ struct vdpa_config_ops { (*get_vq_notification)(struct vdpa_device *vdev, u16 idx); /* vq irq is not expected to be changed once DRIVER_OK is set */ int (*get_vq_irq)(struct vdpa_device *vdev, u16 idx); + u16 (*get_vq_size)(struct vdpa_device *vdev, u16 idx); /* Device ops */ u32 (*get_vq_align)(struct vdpa_device *vdev); -- cgit v1.2.3 From c2475a9a789721bfdcc1b16aaf61ccfecb891914 Mon Sep 17 00:00:00 2001 From: Zhu Lingshan Date: Mon, 19 Feb 2024 02:55:57 +0800 Subject: vDPA: report virtio-block capacity to user space This commit allows userspace to query capacity of a virtio-block device. Signed-off-by: Zhu Lingshan Message-Id: <20240218185606.13509-2-lingshan.zhu@intel.com> Signed-off-by: Michael S. 
Tsirkin --- include/linux/vdpa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 4097e8e92860..7977ca03ac7a 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -7,6 +7,7 @@ #include #include #include +#include #include /** -- cgit v1.2.3 From f6e0a4984c2e7244689ea87b62b433bed9d07e94 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Mar 2024 20:08:45 +0000 Subject: net: move dev->state into net_device_read_txrx group dev->state can be read in rx and tx fast paths. netif_running() which needs dev->state is called from - enqueue_to_backlog() [RX path] - __dev_direct_xmit() [TX path] Fixes: 43a71cd66b9c ("net-device: reorganize net_device fast path variables") Signed-off-by: Eric Dumazet Cc: Coco Li Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20240314200845.3050179-1-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c6f6ac779b34..cb37817d6382 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2072,6 +2072,7 @@ struct net_device { struct pcpu_sw_netstats __percpu *tstats; struct pcpu_dstats __percpu *dstats; }; + unsigned long state; unsigned int flags; unsigned short hard_header_len; netdev_features_t features; @@ -2117,7 +2118,6 @@ struct net_device { * part of the usual set specified in Space.c. */ - unsigned long state; struct list_head dev_list; struct list_head napi_list; -- cgit v1.2.3 From 2d9d9f256c8c85049306df3131ec7c81f9d8317c Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Thu, 14 Mar 2024 13:00:06 +0100 Subject: lib/bitmap: Fix bitmap_scatter() and bitmap_gather() kernel doc The make htmldoc command failed with the following error ... include/linux/bitmap.h:524: ERROR: Unexpected indentation. ... 
include/linux/bitmap.h:524: CRITICAL: Unexpected section title or transition. Move the visual representation to a literal block. Fixes: de5f84338970 ("lib/bitmap: Introduce bitmap_scatter() and bitmap_gather() helpers") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/linux-kernel/20240312153059.3ffde1b7@canb.auug.org.au/ Signed-off-by: Herve Codina Reviewed-by: Andy Shevchenko Reviewed-by: Bagas Sanjaya Acked-by: Yury Norov Link: https://lore.kernel.org/r/20240314120006.458580-1-herve.codina@bootlin.com Signed-off-by: Jakub Kicinski --- include/linux/bitmap.h | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index fb3a9c93ac86..aa4096126553 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -522,17 +522,18 @@ static inline void bitmap_replace(unsigned long *dst, * * (Bits 0, 1, 2, 3, 4, 5 are copied to the bits 0, 1, 4, 8, 9, 12) * - * A more 'visual' description of the operation: - * src: 0000000001011010 - * |||||| - * +------+||||| - * | +----+|||| - * | |+----+||| - * | || +-+|| - * | || | || - * mask: ...v..vv...v..vv - * ...0..11...0..10 - * dst: 0000001100000010 + * A more 'visual' description of the operation:: + * + * src: 0000000001011010 + * |||||| + * +------+||||| + * | +----+|||| + * | |+----+||| + * | || +-+|| + * | || | || + * mask: ...v..vv...v..vv + * ...0..11...0..10 + * dst: 0000001100000010 * * A relationship exists between bitmap_scatter() and bitmap_gather(). * bitmap_gather() can be seen as the 'reverse' bitmap_scatter() operation. 
@@ -568,16 +569,17 @@ static inline void bitmap_scatter(unsigned long *dst, const unsigned long *src, * * (Bits 0, 1, 4, 8, 9, 12 are copied to the bits 0, 1, 2, 3, 4, 5) * - * A more 'visual' description of the operation: - * mask: ...v..vv...v..vv - * src: 0000001100000010 - * ^ ^^ ^ 0 - * | || | 10 - * | || > 010 - * | |+--> 1010 - * | +--> 11010 - * +----> 011010 - * dst: 0000000000011010 + * A more 'visual' description of the operation:: + * + * mask: ...v..vv...v..vv + * src: 0000001100000010 + * ^ ^^ ^ 0 + * | || | 10 + * | || > 010 + * | |+--> 1010 + * | +--> 11010 + * +----> 011010 + * dst: 0000000000011010 * * A relationship exists between bitmap_gather() and bitmap_scatter(). See * bitmap_scatter() for the bitmap scatter detailed operations. -- cgit v1.2.3 From 1a77557d48cff187a169c2aec01c0dd78a5e7e50 Mon Sep 17 00:00:00 2001 From: Yan Zhai Date: Tue, 19 Mar 2024 13:44:34 -0700 Subject: rcu: add a helper to report consolidated flavor QS When under heavy load, network processing can run CPU-bound for many tens of seconds. Even in preemptible kernels (non-RT kernel), this can block RCU Tasks grace periods, which can cause trace-event removal to take more than a minute, which is unacceptably long. This commit therefore creates a new helper function that passes through both RCU and RCU-Tasks quiescent states every 100 milliseconds. This hard-coded value suffices for current workloads. Suggested-by: Paul E. McKenney Reviewed-by: Jesper Dangaard Brouer Signed-off-by: Yan Zhai Reviewed-by: Paul E. 
McKenney Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/90431d46ee112d2b0af04dbfe936faaca11810a5.1710877680.git.yan@cloudflare.com Signed-off-by: Jakub Kicinski --- include/linux/rcupdate.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 16f519914415..17d7ed5f3ae6 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -247,6 +247,37 @@ do { \ cond_resched(); \ } while (0) +/** + * rcu_softirq_qs_periodic - Report RCU and RCU-Tasks quiescent states + * @old_ts: jiffies at start of processing. + * + * This helper is for long-running softirq handlers, such as NAPI threads in + * networking. The caller should initialize the variable passed in as @old_ts + * at the beginning of the softirq handler. When invoked frequently, this macro + * will invoke rcu_softirq_qs() every 100 milliseconds thereafter, which will + * provide both RCU and RCU-Tasks quiescent states. Note that this macro + * modifies its old_ts argument. + * + * Because regions of code that have disabled softirq act as RCU read-side + * critical sections, this macro should be invoked with softirq (and + * preemption) enabled. + * + * The macro is not needed when CONFIG_PREEMPT_RT is defined. RT kernels would + * have more chance to invoke schedule() calls and provide necessary quiescent + * states. As a contrast, calling cond_resched() only won't achieve the same + * effect because cond_resched() does not provide RCU-Tasks quiescent states. + */ +#define rcu_softirq_qs_periodic(old_ts) \ +do { \ + if (!IS_ENABLED(CONFIG_PREEMPT_RT) && \ + time_after(jiffies, (old_ts) + HZ / 10)) { \ + preempt_disable(); \ + rcu_softirq_qs(); \ + preempt_enable(); \ + (old_ts) = jiffies; \ + } \ +} while (0) + /* * Infrastructure to implement the synchronize_() primitives in * TREE_RCU and rcu_barrier_() primitives in TINY_RCU. 
-- cgit v1.2.3 From 203a6763ab699da0568fd2b76303d03bb121abd4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 13 Mar 2024 16:32:27 -0700 Subject: Revert "crypto: pkcs7 - remove sha1 support" This reverts commit 16ab7cb5825fc3425c16ad2c6e53d827f382d7c6 because it broke iwd. iwd uses the KEYCTL_PKEY_* UAPIs via its dependency libell, and apparently it is relying on SHA-1 signature support. These UAPIs are fairly obscure, and their documentation does not mention which algorithms they support. iwd really should be using a properly supported userspace crypto library instead. Regardless, since something broke we have to revert the change. It may be possible that some parts of this commit can be reinstated without breaking iwd (e.g. probably the removal of MODULE_SIG_SHA1), but for now this just does a full revert to get things working again. Reported-by: Karel Balej Closes: https://lore.kernel.org/r/CZSHRUIJ4RKL.34T4EASV5DNJM@matfyz.cz Cc: Dimitri John Ledkov Signed-off-by: Eric Biggers Tested-by: Karel Balej Signed-off-by: Herbert Xu --- include/linux/oid_registry.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h index 3921fbed0b28..51421fdbb0ba 100644 --- a/include/linux/oid_registry.h +++ b/include/linux/oid_registry.h @@ -17,10 +17,12 @@ * build_OID_registry.pl to generate the data for look_up_OID(). 
*/ enum OID { + OID_id_dsa_with_sha1, /* 1.2.840.10030.4.3 */ OID_id_dsa, /* 1.2.840.10040.4.1 */ OID_id_ecPublicKey, /* 1.2.840.10045.2.1 */ OID_id_prime192v1, /* 1.2.840.10045.3.1.1 */ OID_id_prime256v1, /* 1.2.840.10045.3.1.7 */ + OID_id_ecdsa_with_sha1, /* 1.2.840.10045.4.1 */ OID_id_ecdsa_with_sha224, /* 1.2.840.10045.4.3.1 */ OID_id_ecdsa_with_sha256, /* 1.2.840.10045.4.3.2 */ OID_id_ecdsa_with_sha384, /* 1.2.840.10045.4.3.3 */ @@ -28,6 +30,7 @@ enum OID { /* PKCS#1 {iso(1) member-body(2) us(840) rsadsi(113549) pkcs(1) pkcs-1(1)} */ OID_rsaEncryption, /* 1.2.840.113549.1.1.1 */ + OID_sha1WithRSAEncryption, /* 1.2.840.113549.1.1.5 */ OID_sha256WithRSAEncryption, /* 1.2.840.113549.1.1.11 */ OID_sha384WithRSAEncryption, /* 1.2.840.113549.1.1.12 */ OID_sha512WithRSAEncryption, /* 1.2.840.113549.1.1.13 */ @@ -64,6 +67,7 @@ enum OID { OID_PKU2U, /* 1.3.5.1.5.2.7 */ OID_Scram, /* 1.3.6.1.5.5.14 */ OID_certAuthInfoAccess, /* 1.3.6.1.5.5.7.1.1 */ + OID_sha1, /* 1.3.14.3.2.26 */ OID_id_ansip384r1, /* 1.3.132.0.34 */ OID_sha256, /* 2.16.840.1.101.3.4.2.1 */ OID_sha384, /* 2.16.840.1.101.3.4.2.2 */ -- cgit v1.2.3 From d8e45f2929b94099913eb66c3ebb18b5063e9421 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 6 Mar 2024 15:51:36 -0800 Subject: overflow: Change DEFINE_FLEX to take __counted_by member The norm should be flexible array structures with __counted_by annotations, so DEFINE_FLEX() is updated to expect that. Rename the non-annotated version to DEFINE_RAW_FLEX(), and update the few existing users. Additionally add selftests for the macros. Reviewed-by: Gustavo A. R. 
Silva Link: https://lore.kernel.org/r/20240306235128.it.933-kees@kernel.org Reviewed-by: Przemek Kitszel Signed-off-by: Kees Cook --- include/linux/overflow.h | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/overflow.h b/include/linux/overflow.h index aa691f2119b0..0c7e3dcfe867 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -398,7 +398,7 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) * @count: Number of elements in the array; must be compile-time const. * @initializer: initializer expression (could be empty for no init). */ -#define _DEFINE_FLEX(type, name, member, count, initializer) \ +#define _DEFINE_FLEX(type, name, member, count, initializer...) \ _Static_assert(__builtin_constant_p(count), \ "onstack flex array members require compile-time const count"); \ union { \ @@ -408,8 +408,8 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) type *name = (type *)&name##_u /** - * DEFINE_FLEX() - Define an on-stack instance of structure with a trailing - * flexible array member. + * DEFINE_RAW_FLEX() - Define an on-stack instance of structure with a trailing + * flexible array member, when it does not have a __counted_by annotation. * * @type: structure type name, including "struct" keyword. * @name: Name for a variable to define. @@ -420,7 +420,24 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) * flexible array member. * Use __struct_size(@name) to get compile-time size of it afterwards. */ -#define DEFINE_FLEX(type, name, member, count) \ +#define DEFINE_RAW_FLEX(type, name, member, count) \ _DEFINE_FLEX(type, name, member, count, = {}) +/** + * DEFINE_FLEX() - Define an on-stack instance of structure with a trailing + * flexible array member. + * + * @TYPE: structure type name, including "struct" keyword. + * @NAME: Name for a variable to define. 
+ * @MEMBER: Name of the array member. + * @COUNTER: Name of the __counted_by member. + * @COUNT: Number of elements in the array; must be compile-time const. + * + * Define a zeroed, on-stack, instance of @TYPE structure with a trailing + * flexible array member. + * Use __struct_size(@NAME) to get compile-time size of it afterwards. + */ +#define DEFINE_FLEX(TYPE, NAME, MEMBER, COUNTER, COUNT) \ + _DEFINE_FLEX(TYPE, NAME, MEMBER, COUNT, = { .obj.COUNTER = COUNT, }) + #endif /* __LINUX_OVERFLOW_H */ -- cgit v1.2.3 From 0c76106cb97548810214def8ee22700bbbb90543 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 19 Mar 2024 16:12:09 +0900 Subject: scsi: sd: Fix TCG OPAL unlock on system resume Commit 3cc2ffe5c16d ("scsi: sd: Differentiate system and runtime start/stop management") introduced the manage_system_start_stop scsi_device flag to allow libata to indicate to the SCSI disk driver that nothing should be done when resuming a disk on system resume. This change turned the execution of sd_resume() into a no-op for ATA devices on system resume. While this solved deadlock issues during device resume, this change also wrongly removed the execution of opal_unlock_from_suspend(). As a result, devices with TCG OPAL locking enabled remain locked and inaccessible after a system resume from sleep. To fix this issue, introduce the SCSI driver resume method and implement it with the sd_resume() function calling opal_unlock_from_suspend(). The former sd_resume() function is renamed to sd_resume_common() and modified to call the new sd_resume() function. For non-ATA devices, this results in no functional changes. In order for libata to explicitly execute sd_resume() when a device is resumed during system restart, the function scsi_resume_device() is introduced. libata calls this function from the revalidation work executed on device resume, a state that is indicated with the new device flag ATA_DFLAG_RESUMING. 
Doing so, locked TCG OPAL enabled devices are unlocked on resume, allowing normal operation. Fixes: 3cc2ffe5c16d ("scsi: sd: Differentiate system and runtime start/stop management") Link: https://bugzilla.kernel.org/show_bug.cgi?id=218538 Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20240319071209.1179257-1-dlemoal@kernel.org Signed-off-by: Martin K. Petersen --- include/linux/libata.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 26d68115afb8..324d792e7c78 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -107,6 +107,7 @@ enum { ATA_DFLAG_NCQ_PRIO_ENABLED = (1 << 20), /* Priority cmds sent to dev */ ATA_DFLAG_CDL_ENABLED = (1 << 21), /* cmd duration limits is enabled */ + ATA_DFLAG_RESUMING = (1 << 22), /* Device is resuming */ ATA_DFLAG_DETACH = (1 << 24), ATA_DFLAG_DETACHED = (1 << 25), ATA_DFLAG_DA = (1 << 26), /* device supports Device Attention */ -- cgit v1.2.3 From c2ddeb29612f7ca84ed10c6d4f3ac99705135447 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 25 Mar 2024 13:58:08 +0100 Subject: genirq: Introduce IRQF_COND_ONESHOT and use it in pinctrl-amd There is a problem when a driver requests a shared interrupt line to run a threaded handler on it without IRQF_ONESHOT set if that flag has been set already for the IRQ in question by somebody else. Namely, the request fails which usually leads to a probe failure even though the driver might have worked just fine with IRQF_ONESHOT, but it does not want to use it by default. Currently, the only way to handle this is to try to request the IRQ without IRQF_ONESHOT, but with IRQF_PROBE_SHARED set and if this fails, try again with IRQF_ONESHOT set. However, this is a bit cumbersome and not very clean. 
When commit 7a36b901a6eb ("ACPI: OSL: Use a threaded interrupt handler for SCI") switched the ACPI subsystem over to using a threaded interrupt handler for the SCI, it had to use IRQF_ONESHOT for it because that's required due to the way the SCI handler works (it needs to walk all of the enabled GPEs before the interrupt line can be unmasked). The SCI interrupt line is not shared with other users very often due to the SCI handling overhead, but on some systems it is shared and when the other user of it attempts to install a threaded handler, a flags mismatch related to IRQF_ONESHOT may occur. As it turned out, that happened to the pinctrl-amd driver and so commit 4451e8e8415e ("pinctrl: amd: Add IRQF_ONESHOT to the interrupt request") attempted to address the issue by adding IRQF_ONESHOT to the interrupt flags in that driver, but this is now causing an IRQF_ONESHOT-related mismatch to occur on another system which cannot boot as a result of it. Clearly, pinctrl-amd can work with IRQF_ONESHOT if need be, but it should not set that flag by default, so it needs a way to indicate that to the interrupt subsystem. To that end, introduce a new interrupt flag, IRQF_COND_ONESHOT, which will only have effect when the IRQ line is shared and IRQF_ONESHOT has been set for it already, in which case it will be promoted to the latter. This is sufficient for drivers sharing the interrupt line with the SCI as it is requested by the ACPI subsystem before any drivers are probed, so they will always see IRQF_ONESHOT set for the interrupt in question. Fixes: 4451e8e8415e ("pinctrl: amd: Add IRQF_ONESHOT to the interrupt request") Reported-by: Francisco Ayala Le Brun Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Thomas Gleixner Reviewed-by: Linus Walleij Cc: 6.8+ # 6.8+ Closes: https://lore.kernel.org/lkml/CAN-StX1HqWqi+YW=t+V52-38Mfp5fAz7YHx4aH-CQjgyNiKx3g@mail.gmail.com/ Link: https://lore.kernel.org/r/12417336.O9o76ZdvQC@kreacher --- include/linux/interrupt.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 76121c2bb4f8..5c9bdd3ffccc 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -67,6 +67,8 @@ * later. * IRQF_NO_DEBUG - Exclude from runnaway detection for IPI and similar handlers, * depends on IRQF_PERCPU. + * IRQF_COND_ONESHOT - Agree to do IRQF_ONESHOT if already set for a shared + * interrupt. */ #define IRQF_SHARED 0x00000080 #define IRQF_PROBE_SHARED 0x00000100 @@ -82,6 +84,7 @@ #define IRQF_COND_SUSPEND 0x00040000 #define IRQF_NO_AUTOEN 0x00080000 #define IRQF_NO_DEBUG 0x00100000 +#define IRQF_COND_ONESHOT 0x00200000 #define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD) -- cgit v1.2.3 From 52464f59a361a3ba49d6eabc4f65d5c0b9d1de39 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 25 Mar 2024 17:00:57 +0000 Subject: gpiolib: Add stubs for GPIO lookup functions The gpio_device_find_by_() functions do not have stubs which means that if they are referenced from code with an optional dependency on gpiolib then the code will fail to link. Add stubs for lookups via fwnode and label. I have not added a stub for plain gpio_device_find() since it seems harder to see a use case for that which does not depend on gpiolib. 
With the addition of the GPIO reset controller (which lacks a gpiolib dependency) to the arm64 defconfig this is causing build breaks for arm64 virtconfig in -next: aarch64-linux-gnu-ld: drivers/reset/core.o: in function `__reset_add_reset_gpio_lookup': /build/stage/linux/drivers/reset/core.c:861:(.text+0xccc): undefined reference to `gpio_device_find_by_fwnode' Signed-off-by: Mark Brown Reviewed-by: Krzysztof Kozlowski Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index dc75f802e284..f8617eaf08ba 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -646,8 +646,6 @@ int devm_gpiochip_add_data_with_key(struct device *dev, struct gpio_chip *gc, struct gpio_device *gpio_device_find(const void *data, int (*match)(struct gpio_chip *gc, const void *data)); -struct gpio_device *gpio_device_find_by_label(const char *label); -struct gpio_device *gpio_device_find_by_fwnode(const struct fwnode_handle *fwnode); struct gpio_device *gpio_device_get(struct gpio_device *gdev); void gpio_device_put(struct gpio_device *gdev); @@ -814,6 +812,9 @@ struct gpio_device *gpiod_to_gpio_device(struct gpio_desc *desc); int gpio_device_get_base(struct gpio_device *gdev); const char *gpio_device_get_label(struct gpio_device *gdev); +struct gpio_device *gpio_device_find_by_label(const char *label); +struct gpio_device *gpio_device_find_by_fwnode(const struct fwnode_handle *fwnode); + #else /* CONFIG_GPIOLIB */ #include @@ -843,6 +844,18 @@ static inline const char *gpio_device_get_label(struct gpio_device *gdev) return NULL; } +static inline struct gpio_device *gpio_device_find_by_label(const char *label) +{ + WARN_ON(1); + return NULL; +} + +static inline struct gpio_device *gpio_device_find_by_fwnode(const struct fwnode_handle *fwnode) +{ + WARN_ON(1); + return NULL; +} + static 
inline int gpiochip_lock_as_irq(struct gpio_chip *gc, unsigned int offset) { -- cgit v1.2.3 From 9cecde80aae0fb0aa44425575d5aca71bc646d89 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 15 Mar 2024 14:08:21 +0000 Subject: mm: increase folio batch size On a 104 thread, 2 socket Skylake system, Intel report a 4.7% performance reduction with will-it-scale page_fault2. This was due to reducing the size of the batch from 32 to 15. Increasing the folio batch size from 15 to 31 gives a performance increase of 12.5% relative to the original, or 17.2% relative to the reduced performance commit. The penalty of this commit is an additional 128 bytes of stack usage. Six folio_batches are also allocated from percpu memory in cpu_fbatches so that will be an additional 768 bytes of percpu memory (per CPU). Tim Chen originally submitted a patch like this in 2020: https://lore.kernel.org/linux-mm/d1cc9f12a8ad6c2a52cb600d93b06b064f2bbc57.1593205965.git.tim.c.chen@linux.intel.com/ Link: https://lkml.kernel.org/r/20240315140823.2478146-1-willy@infradead.org Fixes: 99fbb6bfc16f ("mm: make folios_put() the basis of release_pages()") Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Yujie Liu Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202403151058.7048f6a8-oliver.sang@intel.com Signed-off-by: Andrew Morton --- include/linux/pagevec.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index fcc06c300a72..5d3a0cccc6bf 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -11,8 +11,8 @@ #include -/* 15 pointers + header align the folio_batch structure to a power of two */ -#define PAGEVEC_SIZE 15 +/* 31 pointers + header align the folio_batch structure to a power of two */ +#define PAGEVEC_SIZE 31 struct folio; -- cgit v1.2.3 From d5aad4c2ca057e760a92a9a7d65bd38d72963f27 Mon Sep 17 00:00:00 2001 From: Zev Weiss Date: Mon, 26 Feb 
2024 17:35:41 -0800 Subject: prctl: generalize PR_SET_MDWE support check to be per-arch Patch series "ARM: prctl: Reject PR_SET_MDWE where not supported". I noticed after a recent kernel update that my ARM926 system started segfaulting on any execve() after calling prctl(PR_SET_MDWE). After some investigation it appears that ARMv5 is incapable of providing the appropriate protections for MDWE, since any readable memory is also implicitly executable. The prctl_set_mdwe() function already had some special-case logic added disabling it on PARISC (commit 793838138c15, "prctl: Disable prctl(PR_SET_MDWE) on parisc"); this patch series (1) generalizes that check to use an arch_*() function, and (2) adds a corresponding override for ARM to disable MDWE on pre-ARMv6 CPUs. With the series applied, prctl(PR_SET_MDWE) is rejected on ARMv5 and subsequent execve() calls (as well as mmap(PROT_READ|PROT_WRITE)) can succeed instead of unconditionally failing; on ARMv6 the prctl works as it did previously. [0] https://lore.kernel.org/all/2023112456-linked-nape-bf19@gregkh/ This patch (of 2): There exist systems other than PARISC where MDWE may not be feasible to support; rather than cluttering up the generic code with additional arch-specific logic let's add a generic function for checking MDWE support and allow each arch to override it as needed. Link: https://lkml.kernel.org/r/20240227013546.15769-4-zev@bewilderbeest.net Link: https://lkml.kernel.org/r/20240227013546.15769-5-zev@bewilderbeest.net Signed-off-by: Zev Weiss Acked-by: Helge Deller [parisc] Cc: Borislav Petkov Cc: David Hildenbrand Cc: Florent Revest Cc: "James E.J. 
Bottomley" Cc: Josh Triplett Cc: Kees Cook Cc: Miguel Ojeda Cc: Mike Rapoport (IBM) Cc: Oleg Nesterov Cc: Ondrej Mosnacek Cc: Rick Edgecombe Cc: Russell King (Oracle) Cc: Sam James Cc: Stefan Roesch Cc: Yang Shi Cc: Yin Fengwei Cc: [6.3+] Signed-off-by: Andrew Morton --- include/linux/mman.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mman.h b/include/linux/mman.h index dc7048824be8..bcb201ab7a41 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -162,6 +162,14 @@ calc_vm_flag_bits(unsigned long flags) unsigned long vm_commit_limit(void); +#ifndef arch_memory_deny_write_exec_supported +static inline bool arch_memory_deny_write_exec_supported(void) +{ + return true; +} +#define arch_memory_deny_write_exec_supported arch_memory_deny_write_exec_supported +#endif + /* * Denies creating a writable executable mapping or gaining executable permissions. * -- cgit v1.2.3 From ea2c09283b44d1a3732a195a9b257d56779c8863 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Mon, 25 Mar 2024 09:25:05 +0100 Subject: net: wan: framer: Add missing static inline qualifiers Compilation with CONFIG_GENERIC_FRAMER disabled lead to the following warnings: framer.h:184:16: warning: no previous prototype for function 'framer_get' [-Wmissing-prototypes] 184 | struct framer *framer_get(struct device *dev, const char *con_id) framer.h:184:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 184 | struct framer *framer_get(struct device *dev, const char *con_id) framer.h:189:6: warning: no previous prototype for function 'framer_put' [-Wmissing-prototypes] 189 | void framer_put(struct device *dev, struct framer *framer) framer.h:189:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 189 | void framer_put(struct device *dev, struct framer *framer) Add missing 'static inline' qualifiers for these functions. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202403241110.hfJqeJRu-lkp@intel.com/ Fixes: 82c944d05b1a ("net: wan: Add framer framework support") Cc: stable@vger.kernel.org Signed-off-by: Herve Codina Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- include/linux/framer/framer.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/framer/framer.h b/include/linux/framer/framer.h index 9a9b88962c29..2b85fe9e7f9a 100644 --- a/include/linux/framer/framer.h +++ b/include/linux/framer/framer.h @@ -181,12 +181,12 @@ static inline int framer_notifier_unregister(struct framer *framer, return -ENOSYS; } -struct framer *framer_get(struct device *dev, const char *con_id) +static inline struct framer *framer_get(struct device *dev, const char *con_id) { return ERR_PTR(-ENOSYS); } -void framer_put(struct device *dev, struct framer *framer) +static inline void framer_put(struct device *dev, struct framer *framer) { } -- cgit v1.2.3 From 18685451fc4e546fc0e718580d32df3c0e5c8272 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 26 Mar 2024 11:18:41 +0100 Subject: inet: inet_defrag: prevent sk release while still in use ip_local_out() and other functions can pass skb->sk as function argument. If the skb is a fragment and reassembly happens before such function call returns, the sk must not be released. This affects skb fragments reassembled via netfilter or similar modules, e.g. openvswitch or ct_act.c, when run as part of tx pipeline. Eric Dumazet made an initial analysis of this bug. Quoting Eric: Calling ip_defrag() in output path is also implying skb_orphan(), which is buggy because output path relies on sk not disappearing. A relevant old patch about the issue was : 8282f27449bf ("inet: frag: Always orphan skbs inside ip_defrag()") [..] net/ipv4/ip_output.c depends on skb->sk being set, and probably to an inet socket, not an arbitrary one. 
If we orphan the packet in ipvlan, then downstream things like FQ packet scheduler will not work properly. We need to change ip_defrag() to only use skb_orphan() when really needed, ie whenever frag_list is going to be used. Eric suggested to stash sk in fragment queue and made an initial patch. However there is a problem with this: If skb is refragmented again right after, ip_do_fragment() will copy head->sk to the new fragments, and sets up destructor to sock_wfree. IOW, we have no choice but to fix up sk_wmem accounting to reflect the fully reassembled skb, else wmem will underflow. This change moves the orphan down into the core, to last possible moment. As ip_defrag_offset is aliased with sk_buff->sk member, we must move the offset into the FRAG_CB, else skb->sk gets clobbered. This allows to delay the orphaning long enough to learn if the skb has to be queued or if the skb is completing the reasm queue. In the former case, things work as before, skb is orphaned. This is safe because skb gets queued/stolen and won't continue past reasm engine. In the latter case, we will steal the skb->sk reference, reattach it to the head skb, and fix up wmem accounting when inet_frag inflates truesize. 
Fixes: 7026b1ddb6b8 ("netfilter: Pass socket pointer down through okfn().") Diagnosed-by: Eric Dumazet Reported-by: xingwei lee Reported-by: yue sun Reported-by: syzbot+e5167d7144a62715044c@syzkaller.appspotmail.com Signed-off-by: Florian Westphal Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240326101845.30836-1-fw@strlen.de Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0c7c67b3a87b..9d24aec064e8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -753,8 +753,6 @@ typedef unsigned char *sk_buff_data_t; * @list: queue head * @ll_node: anchor in an llist (eg socket defer_list) * @sk: Socket we are owned by - * @ip_defrag_offset: (aka @sk) alternate use of @sk, used in - * fragmentation management * @dev: Device we arrived on/are leaving by * @dev_scratch: (aka @dev) alternate use of @dev when @dev would be %NULL * @cb: Control buffer. Free for use by every layer. Put private vars here @@ -875,10 +873,7 @@ struct sk_buff { struct llist_node ll_node; }; - union { - struct sock *sk; - int ip_defrag_offset; - }; + struct sock *sk; union { ktime_t tstamp; -- cgit v1.2.3